1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
62static cl::opt<bool> UseDivergentRegisterIndexing(
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
67// TODO: This option should be removed once we switch to always using PTRADD in
68// the SelectionDAG.
69static cl::opt<bool> UseSelectionDAGPTRADD(
70 "amdgpu-use-sdag-ptradd", cl::Hidden,
71 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
72 "SelectionDAG ISel"),
73 cl::init(false));
74
75static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
76 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
77 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
78}
79
80static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
81 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
82 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
83}
84
85static unsigned findFirstFreeSGPR(CCState &CCInfo) {
86 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
87 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
88 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
89 return AMDGPU::SGPR0 + Reg;
90 }
91 }
92 llvm_unreachable("Cannot allocate sgpr");
93}
94
95SITargetLowering::SITargetLowering(const TargetMachine &TM,
96 const GCNSubtarget &STI)
97 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
98 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
99 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
100
101 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
102 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
103
104 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
105
106 const SIRegisterInfo *TRI = STI.getRegisterInfo();
107 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
108
109 addRegisterClass(MVT::f64, V64RegClass);
110 addRegisterClass(MVT::v2f32, V64RegClass);
111 addRegisterClass(MVT::Untyped, V64RegClass);
112
113 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
114 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
115
116 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
118
119 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
120 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
121
122 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
123 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
124
125 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
127
128 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
129 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
130
131 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
132 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
136
137 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
138 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
139
140 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
141 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
142
143 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
144 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
151
152 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
154
155 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
156 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
157
158 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
159 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
160
161 if (Subtarget->has16BitInsts()) {
162 if (Subtarget->useRealTrue16Insts()) {
163 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
165 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
166 } else {
167 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
170 }
171
172 // Unless there are also VOP3P operations, no operations are really legal.
173 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
176 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
179 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
182 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
185 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
187 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
188 }
189
190 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
191 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
192
193 computeRegisterProperties(Subtarget->getRegisterInfo());
194
195 // The boolean content concept here is too inflexible. Compares only ever
196 // really produce a 1-bit result. Any copy/extend from these will turn into a
197 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
198 // it's what most targets use.
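 // (ZeroOrOneBooleanContent means a "true" produced by a setcc is
 // materialized as the integer 1 when it is extended to a wider type.)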
199 setBooleanContents(ZeroOrOneBooleanContent);
200 setBooleanVectorContents(ZeroOrOneBooleanContent);
201
202 // We need to custom lower vector stores from local memory
203 setOperationAction(ISD::LOAD,
204 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
205 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
206 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
207 MVT::i1, MVT::v32i32},
208 Custom);
209
210 setOperationAction(ISD::STORE,
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
217 if (isTypeLegal(MVT::bf16)) {
218 for (unsigned Opc :
220 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
221 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
222 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
223 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
224 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
225 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
226 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
227 ISD::SETCC}) {
228 // FIXME: The promoted-to type shouldn't need to be explicit
229 setOperationAction(Opc, MVT::bf16, Promote);
230 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
231 }
232
234
236 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
237
238 setOperationAction(ISD::FABS, MVT::bf16, Legal);
239 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
241
242 // We only need to custom lower because we can't specify an action for bf16
243 // sources.
246 }
247
248 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
249 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
250 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
251 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
252 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
253 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
254 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
259 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
264
265 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
266 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
267 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
271 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
272
273 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
274
278 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
279
280 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
281
283 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
284
285 setOperationAction(ISD::SETCC, MVT::i1, Promote);
286 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
287 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
288
290 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
291 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
292 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
293 Expand);
295 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
296 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
297 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
298 Expand);
299
301 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
302 MVT::v3i16, MVT::v4i16, MVT::Other},
303 Custom);
304
305 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
306 setOperationAction(ISD::BR_CC,
307 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
308
310
312
314 Expand);
315
316#if 0
318#endif
319
320 // We only support LOAD/STORE and vector manipulation ops for vectors
321 // with > 4 elements.
322 for (MVT VT :
323 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
324 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
325 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
326 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
327 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
328 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
329 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
330 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
331 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
332 switch (Op) {
333 case ISD::LOAD:
334 case ISD::STORE:
336 case ISD::BITCAST:
337 case ISD::UNDEF:
341 case ISD::IS_FPCLASS:
342 break;
347 break;
348 default:
350 break;
351 }
352 }
353 }
354
355 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
356
357 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
358 // is expanded to avoid having two separate loops in case the index is a VGPR.
359
360 // Most operations are naturally 32-bit vector operations. We only support
361 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
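 // In effect, e.g. a v2i64 build_vector is legalized as the corresponding
 // v4i32 build_vector and the result is bitcast back to v2i64, so only the
 // loads and stores ever see the 64-bit element type.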
362 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
364 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
374 }
375
376 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
378 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
388 }
389
390 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
392 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
402 }
403
404 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
406 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
416 }
417
418 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
420 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
430 }
431
433 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
434 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
435 Custom);
436
437 if (Subtarget->hasPkMovB32()) {
438 // TODO: 16-bit element vectors should be legal with even aligned elements.
439 // TODO: Can be legal with wider source types than the result with
440 // subregister extracts.
441 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
442 }
443
445 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
446 // instead lower to cndmask in SITargetLowering::LowerSELECT().
448 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
449 // alignbit.
450 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
451
452 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
453 Custom);
454
455 // Avoid stack access for these.
456 // TODO: Generalize to more vector types.
458 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
459 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
460 Custom);
461
462 // Deal with vec3 vector operations when widened to vec4.
464 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
465
466 // Deal with vec5/6/7 vector operations when widened to vec8.
468 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
469 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
470 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
471 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
472 Custom);
473
474 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
475 // and output demarshalling
476 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
477
478 // We can't return success/failure, only the old value,
479 // so let LLVM add the comparison.
480 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
481 Expand);
482
483 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
484
485 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
486
487 // FIXME: This should be narrowed to i32, but that only happens if i64 is
488 // illegal.
489 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
490 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
491
492 // This is s_memtime on SI and s_memrealtime on VI.
493 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
494
495 if (Subtarget->hasSMemRealTime() ||
496 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
497 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
498 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
499
500 if (Subtarget->has16BitInsts()) {
501 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
502 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
503 } else {
504 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
505 }
506
507 if (Subtarget->hasMadMacF32Insts())
509
510 if (!Subtarget->hasBFI())
511 // fcopysign can be done in a single instruction with BFI.
512 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
513
514 if (!Subtarget->hasBCNT(32))
516
517 if (!Subtarget->hasBCNT(64))
519
520 if (Subtarget->hasFFBH())
522
523 if (Subtarget->hasFFBL())
525
526 // We only really have 32-bit BFE instructions (and 16-bit on VI).
527 //
528 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
529 // effort to match them now. We want this to be false for i64 cases when the
530 // extraction isn't restricted to the upper or lower half. Ideally we would
531 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
532 // span the midpoint are probably relatively rare, so don't worry about them
533 // for now.
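 // For example, ((x >> 8) & 0xff) on i32 maps to a single v_bfe_u32 (or
 // s_bfe_u32) with offset 8 and width 8, whereas an i64 extract that spans
 // bit 32 has no equally cheap single-instruction form here.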
534 if (Subtarget->hasBFE())
536
537 // Clamp modifier on add/sub
538 if (Subtarget->hasIntClamp())
540
541 if (Subtarget->hasAddNoCarry())
542 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
543 Legal);
544
546 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
547 {MVT::f32, MVT::f64}, Custom);
548
549 // These are really only legal for ieee_mode functions. We should be avoiding
550 // them for functions that don't have ieee_mode enabled, so just say they are
551 // legal.
552 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
553 {MVT::f32, MVT::f64}, Legal);
554
555 if (Subtarget->haveRoundOpsF64())
556 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
557 Legal);
558 else
559 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
560 MVT::f64, Custom);
561
562 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
563 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
564 Legal);
565 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
566
567 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
569
570 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
572
573 // Custom lower these because we can't specify a rule based on an illegal
574 // source bf16.
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
576 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
577
578 if (Subtarget->has16BitInsts()) {
581 MVT::i16, Legal);
582
583 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
584
586 MVT::i16, Expand);
587
591 ISD::CTPOP},
592 MVT::i16, Promote);
593
594 setOperationAction(ISD::LOAD, MVT::i16, Custom);
595
596 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
597
598 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
599 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
601 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
602
606
608
609 // F16 - Constant Actions.
612
613 // F16 - Load/Store Actions.
614 setOperationAction(ISD::LOAD, MVT::f16, Promote);
615 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 setOperationAction(ISD::STORE, MVT::f16, Promote);
617 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
618
619 // BF16 - Load/Store Actions.
620 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
621 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 setOperationAction(ISD::STORE, MVT::bf16, Promote);
623 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
624
625 // F16 - VOP1 Actions.
627 ISD::FSIN, ISD::FROUND},
628 MVT::f16, Custom);
629
630 // BF16 - VOP1 Actions.
631 if (Subtarget->hasBF16TransInsts())
632 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
633
636
637 // F16 - VOP2 Actions.
638 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
639 Expand);
640 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
641 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
643
644 // F16 - VOP3 Actions.
646 if (STI.hasMadF16())
648
649 for (MVT VT :
650 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
651 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
652 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
653 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
654 switch (Op) {
655 case ISD::LOAD:
656 case ISD::STORE:
658 case ISD::BITCAST:
659 case ISD::UNDEF:
664 case ISD::IS_FPCLASS:
665 break;
669 break;
670 default:
672 break;
673 }
674 }
675 }
676
677 // v_perm_b32 can handle either of these.
678 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
680
681 // XXX - Do these do anything? Vector constants turn into build_vector.
682 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
683
684 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
685 Legal);
686
687 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
688 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
689 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
690 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
691
692 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
694 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
695 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
696
697 setOperationAction(ISD::AND, MVT::v2i16, Promote);
698 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
699 setOperationAction(ISD::OR, MVT::v2i16, Promote);
700 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
701 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
702 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
703
704 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
705 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
706 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
708 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
710
711 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
712 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
713 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
714 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
715 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
717
718 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
719 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
720 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
722 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
724
725 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
727 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
728 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
729
730 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
732 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
734 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
735 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
736
737 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
738 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
739 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
740 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
741 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
742 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
743
744 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
745 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
746 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
747 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
748 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
749 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
750
751 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
752 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
753 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
755 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
756 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
757
758 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
759 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
760 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
761 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
762 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
763 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
764
766 MVT::v2i32, Expand);
767 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
768
770 MVT::v4i32, Expand);
771
773 MVT::v8i32, Expand);
774
775 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
776 Subtarget->hasVOP3PInsts() ? Legal : Custom);
777
778 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
779 // This isn't really legal, but this avoids the legalizer unrolling it (and
780 // allows matching fneg (fabs x) patterns)
781 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
782
783 // Can do this in one BFI plus a constant materialize.
785 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
786 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
787 MVT::v32f16, MVT::v32bf16},
788 Custom);
789
791 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
792 MVT::f16, Custom);
793 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
794
795 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
796 ISD::FMAXIMUMNUM},
797 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
798 Custom);
799
800 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
801 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
802 Expand);
803
804 for (MVT Vec16 :
805 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
806 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
809 Vec16, Custom);
811 }
812 }
813
814 if (Subtarget->hasVOP3PInsts()) {
818 MVT::v2i16, Legal);
819
820 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
821 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
822 MVT::v2f16, Legal);
823
825 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
826
828 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
829 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
830 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
831 Custom);
832
833 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
834 // Split vector operations.
839 VT, Custom);
840
841 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
842 // Split vector operations.
844 VT, Custom);
845
847 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
848 {MVT::v2f16, MVT::v4f16}, Custom);
849
850 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
851 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
852 Custom);
853
854 if (Subtarget->hasBF16PackedInsts()) {
855 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
856 // Split vector operations.
858 VT, Custom);
859 }
860
861 if (Subtarget->hasPackedFP32Ops()) {
863 MVT::v2f32, Legal);
865 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
866 Custom);
867 }
868 }
869
870 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
871
872 if (Subtarget->has16BitInsts()) {
874 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
876 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
877 } else {
878 // Legalization hack.
879 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
880
881 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
882 }
883
885 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
886 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
887 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
888 MVT::v32f16, MVT::v32bf16},
889 Custom);
890
892
893 if (Subtarget->hasVectorMulU64())
895 else if (Subtarget->hasScalarSMulU64())
897
898 if (Subtarget->hasMad64_32())
900
901 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
902 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
903
904 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
905 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
906 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
907 } else {
908 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
909 if (Subtarget->hasMinimum3Maximum3F32())
910 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
911
912 if (Subtarget->hasMinimum3Maximum3PKF16()) {
913 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
914
915 // If only the vector form is available, we need to widen to a vector.
916 if (!Subtarget->hasMinimum3Maximum3F16())
917 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
918 }
919 }
920
921 if (Subtarget->hasVOP3PInsts()) {
922 // We want to break these into v2f16 pieces, not scalarize.
923 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
924 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
925 Custom);
926 }
927
928 if (Subtarget->hasIntMinMax64())
930 Legal);
931
933 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
934 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
935 MVT::i8},
936 Custom);
937
939 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
940 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
941 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
942 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
943 Custom);
944
946 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
947 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
948 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
949 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
950 Custom);
951
952 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
954 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
955 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
956 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
957
958 // TODO: Could move this to custom lowering, could benefit from combines on
959 // extract of relevant bits.
960 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
961
963
964 if (Subtarget->hasBF16ConversionInsts()) {
965 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
967 }
968
969 if (Subtarget->hasBF16PackedInsts()) {
971 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
972 MVT::v2bf16, Legal);
973 }
974
975 if (Subtarget->hasBF16TransInsts()) {
976 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
977 }
978
979 if (Subtarget->hasCvtPkF16F32Inst()) {
981 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
982 Custom);
983 }
984
986 ISD::PTRADD,
988 ISD::SUB,
990 ISD::MUL,
991 ISD::FADD,
992 ISD::FSUB,
993 ISD::FDIV,
994 ISD::FMUL,
995 ISD::FMINNUM,
996 ISD::FMAXNUM,
997 ISD::FMINNUM_IEEE,
998 ISD::FMAXNUM_IEEE,
999 ISD::FMINIMUM,
1000 ISD::FMAXIMUM,
1001 ISD::FMINIMUMNUM,
1002 ISD::FMAXIMUMNUM,
1003 ISD::FMA,
1004 ISD::SMIN,
1005 ISD::SMAX,
1006 ISD::UMIN,
1007 ISD::UMAX,
1008 ISD::SETCC,
1010 ISD::SMIN,
1011 ISD::SMAX,
1012 ISD::UMIN,
1013 ISD::UMAX,
1014 ISD::AND,
1015 ISD::OR,
1016 ISD::XOR,
1017 ISD::SHL,
1018 ISD::SRL,
1019 ISD::SRA,
1020 ISD::FSHR,
1030
1031 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1033
1034 // All memory operations. Some folding on the pointer operand is done to help
1035 // match the constant offsets in the addressing modes.
1036 setTargetDAGCombine({ISD::LOAD,
1037 ISD::STORE,
1038 ISD::ATOMIC_LOAD,
1039 ISD::ATOMIC_STORE,
1040 ISD::ATOMIC_CMP_SWAP,
1041 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1042 ISD::ATOMIC_SWAP,
1043 ISD::ATOMIC_LOAD_ADD,
1044 ISD::ATOMIC_LOAD_SUB,
1045 ISD::ATOMIC_LOAD_AND,
1046 ISD::ATOMIC_LOAD_OR,
1047 ISD::ATOMIC_LOAD_XOR,
1048 ISD::ATOMIC_LOAD_NAND,
1049 ISD::ATOMIC_LOAD_MIN,
1050 ISD::ATOMIC_LOAD_MAX,
1051 ISD::ATOMIC_LOAD_UMIN,
1052 ISD::ATOMIC_LOAD_UMAX,
1053 ISD::ATOMIC_LOAD_FADD,
1054 ISD::ATOMIC_LOAD_FMIN,
1055 ISD::ATOMIC_LOAD_FMAX,
1056 ISD::ATOMIC_LOAD_UINC_WRAP,
1057 ISD::ATOMIC_LOAD_UDEC_WRAP,
1060
1061 // FIXME: In other contexts we pretend this is a per-function property.
1063
1065}
1066
1067const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1068
1069ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
1070 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1071 return RCRegs;
1072}
1073
1074//===----------------------------------------------------------------------===//
1075// TargetLowering queries
1076//===----------------------------------------------------------------------===//
1077
1078// v_mad_mix* support a conversion from f16 to f32.
1079//
1080// There is only one special case, when denormals are enabled, that we don't
1081// currently handle where this would be OK to use.
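// For example, with the mix instructions available and f32 denormals flushed,
// (fma (fpext f16:a), (fpext f16:b), f32:c) can be selected as a single
// v_fma_mix_f32, folding the f16-to-f32 conversions into the operands.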
1082bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1083 EVT DestVT, EVT SrcVT) const {
1084 return DestVT.getScalarType() == MVT::f32 &&
1085 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1086 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1087 SrcVT.getScalarType() == MVT::f16) ||
1088 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1089 SrcVT.getScalarType() == MVT::bf16)) &&
1090 // TODO: This probably only requires no input flushing?
1091 denormalModeIsFlushAllF32(DAG.getMachineFunction());
1092}
1093
1094bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
1095 LLT DestTy, LLT SrcTy) const {
1096 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1097 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1098 DestTy.getScalarSizeInBits() == 32 &&
1099 SrcTy.getScalarSizeInBits() == 16 &&
1100 // TODO: This probably only requires no input flushing?
1101 denormalModeIsFlushAllF32(*MI.getMF());
1102}
1103
1104bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
1105 // SI has some legal vector types, but no legal vector operations. Say no
1106 // shuffles are legal in order to prefer scalarizing some vector operations.
1107 return false;
1108}
1109
1110MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1111 CallingConv::ID CC,
1112 EVT VT) const {
1113 if (CC == CallingConv::AMDGPU_KERNEL)
1114 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1115
1116 if (VT.isVector()) {
1117 EVT ScalarVT = VT.getScalarType();
1118 unsigned Size = ScalarVT.getSizeInBits();
1119 if (Size == 16) {
1120 if (Subtarget->has16BitInsts()) {
1121 if (VT.isInteger())
1122 return MVT::v2i16;
1123 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1124 }
1125 return VT.isInteger() ? MVT::i32 : MVT::f32;
1126 }
1127
1128 if (Size < 16)
1129 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1130 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1131 }
1132
1133 if (VT.getSizeInBits() > 32)
1134 return MVT::i32;
1135
1136 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1137}
1138
1139unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1140 CallingConv::ID CC,
1141 EVT VT) const {
1142 if (CC == CallingConv::AMDGPU_KERNEL)
1143 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1144
1145 if (VT.isVector()) {
1146 unsigned NumElts = VT.getVectorNumElements();
1147 EVT ScalarVT = VT.getScalarType();
1148 unsigned Size = ScalarVT.getSizeInBits();
1149
1150 // FIXME: Should probably promote 8-bit vectors to i16.
1151 if (Size == 16 && Subtarget->has16BitInsts())
1152 return (NumElts + 1) / 2;
1153
1154 if (Size <= 32)
1155 return NumElts;
1156
1157 if (Size > 32)
1158 return NumElts * ((Size + 31) / 32);
1159 } else if (VT.getSizeInBits() > 32)
1160 return (VT.getSizeInBits() + 31) / 32;
1161
1162 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1163}
1164
1165unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1166 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1167 unsigned &NumIntermediates, MVT &RegisterVT) const {
1168 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1169 unsigned NumElts = VT.getVectorNumElements();
1170 EVT ScalarVT = VT.getScalarType();
1171 unsigned Size = ScalarVT.getSizeInBits();
1172 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1173 // support, but unless we can properly handle 3-vectors, it will still be
1174 // inconsistent.
1175 if (Size == 16 && Subtarget->has16BitInsts()) {
1176 if (ScalarVT == MVT::bf16) {
1177 RegisterVT = MVT::i32;
1178 IntermediateVT = MVT::v2bf16;
1179 } else {
1180 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1181 IntermediateVT = RegisterVT;
1182 }
1183 NumIntermediates = (NumElts + 1) / 2;
1184 return NumIntermediates;
1185 }
1186
1187 if (Size == 32) {
1188 RegisterVT = ScalarVT.getSimpleVT();
1189 IntermediateVT = RegisterVT;
1190 NumIntermediates = NumElts;
1191 return NumIntermediates;
1192 }
1193
1194 if (Size < 16 && Subtarget->has16BitInsts()) {
1195 // FIXME: Should probably form v2i16 pieces
1196 RegisterVT = MVT::i16;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size != 16 && Size <= 32) {
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = ScalarVT;
1205 NumIntermediates = NumElts;
1206 return NumIntermediates;
1207 }
1208
1209 if (Size > 32) {
1210 RegisterVT = MVT::i32;
1211 IntermediateVT = RegisterVT;
1212 NumIntermediates = NumElts * ((Size + 31) / 32);
1213 return NumIntermediates;
1214 }
1215 }
1216
1217 return TargetLowering::getVectorTypeBreakdownForCallingConv(
1218 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1219}
1220
1221static EVT memVTFromLoadIntrData(const SITargetLowering &TLI,
1222 const DataLayout &DL, Type *Ty,
1223 unsigned MaxNumLanes) {
1224 assert(MaxNumLanes != 0);
1225
1226 LLVMContext &Ctx = Ty->getContext();
1227 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1228 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1229 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1230 NumElts);
1231 }
1232
1233 return TLI.getValueType(DL, Ty);
1234}
1235
1236// Peek through TFE struct returns to only use the data size.
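// For example, a TFE image load returns { <4 x float>, i32 }, where the
// trailing i32 is the TFE status value; only the <4 x float> data member
// contributes to the memory VT computed here.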
1237static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI,
1238 const DataLayout &DL, Type *Ty,
1239 unsigned MaxNumLanes) {
1240 auto *ST = dyn_cast<StructType>(Ty);
1241 if (!ST)
1242 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1243
1244 // TFE intrinsics return an aggregate type.
1245 assert(ST->getNumContainedTypes() == 2 &&
1246 ST->getContainedType(1)->isIntegerTy(32));
1247 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1248}
1249
1250/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1251/// in-memory representation. This return value is a custom type because there
1252/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1253/// could cause issues during codegen, these address space 7 pointers will be
1254/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1255/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1256/// for cost modeling, to work. (This also sets us up decently for doing the
1257/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1258MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
1259 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1260 return MVT::amdgpuBufferFatPointer;
1261 if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1262 DL.getPointerSizeInBits(AS) == 192)
1263 return MVT::amdgpuBufferStridedPointer;
1264 return AMDGPUTargetLowering::getPointerTy(DL, AS);
1265}
1266/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1267/// v8i32 when padding is added.
1268/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1269/// also v8i32 with padding.
1270MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
1271 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1272 DL.getPointerSizeInBits(AS) == 160) ||
1273 (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
1274 DL.getPointerSizeInBits(AS) == 192))
1275 return MVT::v8i32;
1276 return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
1277}
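// That is, a p7 value is typed as the 160-bit amdgpuBufferFatPointer for
// legality and cost queries, but anything that round-trips it through memory
// sees the padded 256-bit v8i32 layout returned here.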
1278
1279static unsigned getIntrMemWidth(unsigned IntrID) {
1280 switch (IntrID) {
1281 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1282 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1283 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1284 return 8;
1285 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1286 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1287 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1288 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1289 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1290 return 32;
1291 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1292 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1293 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1294 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1295 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1296 return 64;
1297 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1298 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1299 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1300 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1301 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1302 return 128;
1303 default:
1304 llvm_unreachable("Unknown width");
1305 }
1306}
1307
1308static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1310 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1311 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1312 switch (AtomicOrderingCABI(Ord)) {
1315 break;
1318 break;
1321 break;
1322 default:
1324 break;
1325 }
1326
1327 Info.flags =
1329 Info.flags |= MOCooperative;
1330
1331 MDNode *ScopeMD = cast<MDNode>(
1332 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1333 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1334 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1335}
1336
1337bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1338 const CallInst &CI,
1339 MachineFunction &MF,
1340 unsigned IntrID) const {
1341 Info.flags = MachineMemOperand::MONone;
1342 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1343 Info.flags |= MachineMemOperand::MOInvariant;
1344 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1346 Info.flags |= getTargetMMOFlags(CI);
1347
1348 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1349 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1350 AttributeSet Attr =
1351 Intrinsic::getFnAttributes(CI.getContext(), (Intrinsic::ID)IntrID);
1352 MemoryEffects ME = Attr.getMemoryEffects();
1353 if (ME.doesNotAccessMemory())
1354 return false;
1355
1356 // TODO: Should images get their own address space?
1357 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1358
1359 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1360 if (RsrcIntr->IsImage) {
1361 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1362 AMDGPU::getImageDimIntrinsicInfo(IntrID);
1363 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1364 Info.align.reset();
1365 }
1366
1367 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1368 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1369 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1370 // We conservatively set the memory operand of a buffer intrinsic to the
1371 // base resource pointer, so that we can access alias information about
1372 // those pointers. Cases like "this points at the same value
1373 // but with a different offset" are handled in
1374 // areMemAccessesTriviallyDisjoint.
1375 Info.ptrVal = RsrcArg;
1376 }
1377
1378 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1379 if (!IsSPrefetch) {
1380 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1381 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1382 Info.flags |= MachineMemOperand::MOVolatile;
1383 }
1384
1386 if (ME.onlyReadsMemory()) {
1387 if (RsrcIntr->IsImage) {
1388 unsigned MaxNumLanes = 4;
1389
1390 if (!BaseOpcode->Gather4) {
1391 // If this isn't a gather, we may have excess loaded elements in the
1392 // IR type. Check the dmask for the real number of elements loaded.
1393 unsigned DMask =
1394 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1395 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1396 }
1397
1398 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1399 CI.getType(), MaxNumLanes);
1400 } else {
1401 Info.memVT =
1402 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1403 std::numeric_limits<unsigned>::max());
1404 }
1405
1406 // FIXME: What does alignment mean for an image?
1407 Info.opc = ISD::INTRINSIC_W_CHAIN;
1408 Info.flags |= MachineMemOperand::MOLoad;
1409 } else if (ME.onlyWritesMemory()) {
1410 Info.opc = ISD::INTRINSIC_VOID;
1411
1412 Type *DataTy = CI.getArgOperand(0)->getType();
1413 if (RsrcIntr->IsImage) {
1414 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1415 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1416 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1417 DMaskLanes);
1418 } else
1419 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1420
1421 Info.flags |= MachineMemOperand::MOStore;
1422 } else {
1423 // Atomic, NoReturn Sampler or prefetch
1424 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1426 Info.flags |=
1428
1429 if (!IsSPrefetch)
1430 Info.flags |= MachineMemOperand::MOStore;
1431
1432 switch (IntrID) {
1433 default:
1434 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1435 // Fake memory access type for no return sampler intrinsics
1436 Info.memVT = MVT::i32;
1437 } else {
1438 // XXX - Should this be volatile without known ordering?
1439 Info.flags |= MachineMemOperand::MOVolatile;
1440 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1441 }
1442 break;
1443 case Intrinsic::amdgcn_raw_buffer_load_lds:
1444 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1445 case Intrinsic::amdgcn_struct_buffer_load_lds:
1446 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1447 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1448 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1449 Info.ptrVal = CI.getArgOperand(1);
1450 return true;
1451 }
1452 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1453 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1454 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1455 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1456 Info.memVT =
1457 memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
1458 std::numeric_limits<unsigned>::max());
1459 Info.flags &= ~MachineMemOperand::MOStore;
1460 return true;
1461 }
1462 }
1463 }
1464 return true;
1465 }
1466
1467 switch (IntrID) {
1468 case Intrinsic::amdgcn_ds_ordered_add:
1469 case Intrinsic::amdgcn_ds_ordered_swap: {
1470 Info.opc = ISD::INTRINSIC_W_CHAIN;
1471 Info.memVT = MVT::getVT(CI.getType());
1472 Info.ptrVal = CI.getOperand(0);
1473 Info.align.reset();
1475
1476 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1477 if (!Vol->isZero())
1478 Info.flags |= MachineMemOperand::MOVolatile;
1479
1480 return true;
1481 }
1482 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1483 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1484 Info.opc = ISD::INTRINSIC_W_CHAIN;
1485 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1486 Info.ptrVal = nullptr;
1487 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1489 return true;
1490 }
1491 case Intrinsic::amdgcn_ds_append:
1492 case Intrinsic::amdgcn_ds_consume: {
1493 Info.opc = ISD::INTRINSIC_W_CHAIN;
1494 Info.memVT = MVT::getVT(CI.getType());
1495 Info.ptrVal = CI.getOperand(0);
1496 Info.align.reset();
1498
1499 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1500 if (!Vol->isZero())
1501 Info.flags |= MachineMemOperand::MOVolatile;
1502
1503 return true;
1504 }
1505 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1506 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1507 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1510 Info.memVT = MVT::getVT(CI.getType());
1511 Info.ptrVal = CI.getOperand(0);
1512 Info.memVT = MVT::i64;
1513 Info.size = 8;
1514 Info.align.reset();
1516 return true;
1517 }
1518 case Intrinsic::amdgcn_global_atomic_csub: {
1519 Info.opc = ISD::INTRINSIC_W_CHAIN;
1520 Info.memVT = MVT::getVT(CI.getType());
1521 Info.ptrVal = CI.getOperand(0);
1522 Info.align.reset();
1525 return true;
1526 }
1527 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1528 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1529 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1530 Info.opc = ISD::INTRINSIC_W_CHAIN;
1531 Info.memVT =
1532 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1533 ? CI.getType()
1535 ->getElementType(0)); // XXX: what is correct VT?
1536
1537 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1538 Info.align.reset();
1539 Info.flags |=
1541 return true;
1542 }
1543 case Intrinsic::amdgcn_global_atomic_fmin_num:
1544 case Intrinsic::amdgcn_global_atomic_fmax_num:
1545 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1546 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1547 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1548 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1549 Info.opc = ISD::INTRINSIC_W_CHAIN;
1550 Info.memVT = MVT::getVT(CI.getType());
1551 Info.ptrVal = CI.getOperand(0);
1552 Info.align.reset();
1556 return true;
1557 }
1558 case Intrinsic::amdgcn_flat_load_monitor_b32:
1559 case Intrinsic::amdgcn_flat_load_monitor_b64:
1560 case Intrinsic::amdgcn_flat_load_monitor_b128:
1561 case Intrinsic::amdgcn_global_load_monitor_b32:
1562 case Intrinsic::amdgcn_global_load_monitor_b64:
1563 case Intrinsic::amdgcn_global_load_monitor_b128:
1564 case Intrinsic::amdgcn_cluster_load_b32:
1565 case Intrinsic::amdgcn_cluster_load_b64:
1566 case Intrinsic::amdgcn_cluster_load_b128:
1567 case Intrinsic::amdgcn_ds_load_tr6_b96:
1568 case Intrinsic::amdgcn_ds_load_tr4_b64:
1569 case Intrinsic::amdgcn_ds_load_tr8_b64:
1570 case Intrinsic::amdgcn_ds_load_tr16_b128:
1571 case Intrinsic::amdgcn_global_load_tr6_b96:
1572 case Intrinsic::amdgcn_global_load_tr4_b64:
1573 case Intrinsic::amdgcn_global_load_tr_b64:
1574 case Intrinsic::amdgcn_global_load_tr_b128:
1575 case Intrinsic::amdgcn_ds_read_tr4_b64:
1576 case Intrinsic::amdgcn_ds_read_tr6_b96:
1577 case Intrinsic::amdgcn_ds_read_tr8_b64:
1578 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1579 Info.opc = ISD::INTRINSIC_W_CHAIN;
1580 Info.memVT = MVT::getVT(CI.getType());
1581 Info.ptrVal = CI.getOperand(0);
1582 Info.align.reset();
1583 Info.flags |= MachineMemOperand::MOLoad;
1584 return true;
1585 }
1586 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1587 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1588 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1589 Info.opc = ISD::INTRINSIC_W_CHAIN;
1590 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1591 Info.ptrVal = CI.getOperand(0);
1592 Info.align.reset();
1593 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1594 return true;
1595 }
1596 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1597 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1598 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1599 Info.opc = ISD::INTRINSIC_VOID;
1600 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1601 Info.ptrVal = CI.getArgOperand(0);
1602 Info.align.reset();
1603 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1604 return true;
1605 }
1606 case Intrinsic::amdgcn_ds_gws_init:
1607 case Intrinsic::amdgcn_ds_gws_barrier:
1608 case Intrinsic::amdgcn_ds_gws_sema_v:
1609 case Intrinsic::amdgcn_ds_gws_sema_br:
1610 case Intrinsic::amdgcn_ds_gws_sema_p:
1611 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1612 Info.opc = ISD::INTRINSIC_VOID;
1613
1614 const GCNTargetMachine &TM =
1615 static_cast<const GCNTargetMachine &>(getTargetMachine());
1616
1617 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1618 Info.ptrVal = MFI->getGWSPSV(TM);
1619
1620 // This is an abstract access, but we need to specify a type and size.
1621 Info.memVT = MVT::i32;
1622 Info.size = 4;
1623 Info.align = Align(4);
1624
1625 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1626 Info.flags |= MachineMemOperand::MOLoad;
1627 else
1628 Info.flags |= MachineMemOperand::MOStore;
1629 return true;
1630 }
1631 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1632 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1633 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1634 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1635 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1636 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1637 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1638 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1639 Info.opc = ISD::INTRINSIC_VOID;
1640 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1641 Info.ptrVal = CI.getArgOperand(1);
1643 return true;
1644 }
1645 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1646 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1647 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1648 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1649 Info.opc = ISD::INTRINSIC_VOID;
1650 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1651 Info.ptrVal = CI.getArgOperand(0);
1653 return true;
1654 }
1655 case Intrinsic::amdgcn_load_to_lds:
1656 case Intrinsic::amdgcn_global_load_lds: {
1657 Info.opc = ISD::INTRINSIC_VOID;
1658 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1659 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1660 Info.ptrVal = CI.getArgOperand(1);
1662 return true;
1663 }
1664 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1665 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1666 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1667 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1668 Info.opc = ISD::INTRINSIC_W_CHAIN;
1669
1670 const GCNTargetMachine &TM =
1671 static_cast<const GCNTargetMachine &>(getTargetMachine());
1672
1673 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1674 Info.ptrVal = MFI->getGWSPSV(TM);
1675
1676 // This is an abstract access, but we need to specify a type and size.
1677 Info.memVT = MVT::i32;
1678 Info.size = 4;
1679 Info.align = Align(4);
1680
1682 return true;
1683 }
1684 case Intrinsic::amdgcn_s_prefetch_data:
1685 case Intrinsic::amdgcn_flat_prefetch:
1686 case Intrinsic::amdgcn_global_prefetch: {
1687 Info.opc = ISD::INTRINSIC_VOID;
1688 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1689 Info.ptrVal = CI.getArgOperand(0);
1690 Info.flags |= MachineMemOperand::MOLoad;
1691 return true;
1692 }
1693 default:
1694 return false;
1695 }
1696}
1697
1698void SITargetLowering::CollectTargetIntrinsicOperands(
1699 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1701 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1702 // The DAG's ValueType loses the addrspaces.
1703 // Add them as 2 extra Constant operands "from" and "to".
1704 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1705 unsigned DstAS = I.getType()->getPointerAddressSpace();
1706 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1707 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1708 break;
1709 }
1710 default:
1711 break;
1712 }
1713}
1714
1715bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
1716 SmallVectorImpl<Value *> &Ops,
1717 Type *&AccessTy) const {
1718 Value *Ptr = nullptr;
1719 switch (II->getIntrinsicID()) {
1720 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1721 case Intrinsic::amdgcn_cluster_load_b128:
1722 case Intrinsic::amdgcn_cluster_load_b64:
1723 case Intrinsic::amdgcn_cluster_load_b32:
1724 case Intrinsic::amdgcn_ds_append:
1725 case Intrinsic::amdgcn_ds_consume:
1726 case Intrinsic::amdgcn_ds_load_tr8_b64:
1727 case Intrinsic::amdgcn_ds_load_tr16_b128:
1728 case Intrinsic::amdgcn_ds_load_tr4_b64:
1729 case Intrinsic::amdgcn_ds_load_tr6_b96:
1730 case Intrinsic::amdgcn_ds_read_tr4_b64:
1731 case Intrinsic::amdgcn_ds_read_tr6_b96:
1732 case Intrinsic::amdgcn_ds_read_tr8_b64:
1733 case Intrinsic::amdgcn_ds_read_tr16_b64:
1734 case Intrinsic::amdgcn_ds_ordered_add:
1735 case Intrinsic::amdgcn_ds_ordered_swap:
1736 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1737 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1738 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1739 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1740 case Intrinsic::amdgcn_flat_load_monitor_b128:
1741 case Intrinsic::amdgcn_flat_load_monitor_b32:
1742 case Intrinsic::amdgcn_flat_load_monitor_b64:
1743 case Intrinsic::amdgcn_global_atomic_csub:
1744 case Intrinsic::amdgcn_global_atomic_fmax_num:
1745 case Intrinsic::amdgcn_global_atomic_fmin_num:
1746 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1747 case Intrinsic::amdgcn_global_load_monitor_b128:
1748 case Intrinsic::amdgcn_global_load_monitor_b32:
1749 case Intrinsic::amdgcn_global_load_monitor_b64:
1750 case Intrinsic::amdgcn_global_load_tr_b64:
1751 case Intrinsic::amdgcn_global_load_tr_b128:
1752 case Intrinsic::amdgcn_global_load_tr4_b64:
1753 case Intrinsic::amdgcn_global_load_tr6_b96:
1754 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1755 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1756 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1757 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1758 Ptr = II->getArgOperand(0);
1759 break;
1760 case Intrinsic::amdgcn_load_to_lds:
1761 case Intrinsic::amdgcn_global_load_lds:
1762 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1763 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1764 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1765 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1766 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1767 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1768 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1769 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1770 Ptr = II->getArgOperand(1);
1771 break;
1772 default:
1773 return false;
1774 }
1775 AccessTy = II->getType();
1776 Ops.push_back(Ptr);
1777 return true;
1778}
1779
1781 unsigned AddrSpace) const {
1782 if (!Subtarget->hasFlatInstOffsets()) {
1783 // Flat instructions do not have offsets, and only have the register
1784 // address.
1785 return AM.BaseOffs == 0 && AM.Scale == 0;
1786 }
1787
1788 decltype(SIInstrFlags::FLAT) FlatVariant =
1792
1793 return AM.Scale == 0 &&
1794 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1795 AM.BaseOffs, AddrSpace, FlatVariant));
1796}
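// Illustrative sketch (not part of the lowering): a self-contained restatement
// of the check above. The real offset range comes from
// SIInstrInfo::isLegalFLATOffset and depends on the subtarget, the address
// space and the FLAT variant; the explicit NumOffsetBits parameter here is an
// assumption made only for the example.
[[maybe_unused]] static bool
isLegalFlatAddrModeSketch(int64_t BaseOffs, int64_t Scale,
                          bool HasFlatInstOffsets, unsigned NumOffsetBits) {
  if (!HasFlatInstOffsets) // Only a plain register address is available.
    return BaseOffs == 0 && Scale == 0;
  // No scaled index; the immediate must be 0 or fit the signed offset field.
  int64_t Lo = -(int64_t(1) << (NumOffsetBits - 1));
  int64_t Hi = (int64_t(1) << (NumOffsetBits - 1)) - 1;
  return Scale == 0 && (BaseOffs == 0 || (BaseOffs >= Lo && BaseOffs <= Hi));
}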
1797
1799 if (Subtarget->hasFlatGlobalInsts())
1801
1802 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1803 // Assume we will use FLAT for all global memory accesses
1804 // on VI.
1805 // FIXME: This assumption is currently wrong. On VI we still use
1806 // MUBUF instructions for the r + i addressing mode. As currently
1807 // implemented, the MUBUF instructions only work on buffers < 4GB.
1808 // It may be possible to support > 4GB buffers with MUBUF instructions,
1809 // by setting the stride value in the resource descriptor which would
1810 // increase the size limit to (stride * 4GB). However, this is risky,
1811 // because it has never been validated.
1813 }
1814
1815 return isLegalMUBUFAddressingMode(AM);
1816}
1817
1818bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1819 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1820 // additionally can do r + r + i with addr64. 32-bit has more addressing
1821 // mode options. Depending on the resource constant, it can also do
1822 // (i64 r0) + (i32 r1) * (i14 i).
1823 //
1824 // Private arrays end up using a scratch buffer most of the time, so also
1825 // assume those use MUBUF instructions. Scratch loads / stores are currently
1826 // implemented as mubuf instructions with the offen bit set, so they are
1827 // slightly different from the normal addr64.
1828 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1829 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1830 return false;
1831
1832 // FIXME: Since we can split immediate into soffset and immediate offset,
1833 // would it make sense to allow any immediate?
1834
1835 switch (AM.Scale) {
1836 case 0: // r + i or just i, depending on HasBaseReg.
1837 return true;
1838 case 1:
1839 return true; // We have r + r or r + i.
1840 case 2:
1841 if (AM.HasBaseReg) {
1842 // Reject 2 * r + r.
1843 return false;
1844 }
1845
1846 // Allow 2 * r as r + r,
1847 // and 2 * r + i as r + r + i.
1848 return true;
1849 default: // Don't allow n * r
1850 return false;
1851 }
1852}
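// Illustrative sketch (not part of the lowering): the Scale cases accepted by
// isLegalMUBUFAddressingMode above, written out as a plain predicate. The
// immediate-offset legality is a separate check (isLegalMUBUFImmOffset).
[[maybe_unused]] static bool isAcceptableMUBUFScaleSketch(int64_t Scale,
                                                          bool HasBaseReg) {
  if (Scale == 0 || Scale == 1) // i, r + i, or r + r.
    return true;
  if (Scale == 2)               // 2*r folds to r + r, but 2*r + r does not.
    return !HasBaseReg;
  return false;                 // No general n*r addressing.
}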
1853
1855 const AddrMode &AM, Type *Ty,
1856 unsigned AS,
1857 Instruction *I) const {
1858 // No global is ever allowed as a base.
1859 if (AM.BaseGV)
1860 return false;
1861
1862 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1863 return isLegalGlobalAddressingMode(AM);
1864
1865 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1869 // If the offset isn't a multiple of 4, it probably isn't going to be
1870 // correctly aligned.
1871 // FIXME: Can we get the real alignment here?
1872 if (AM.BaseOffs % 4 != 0)
1873 return isLegalMUBUFAddressingMode(AM);
1874
1875 if (!Subtarget->hasScalarSubwordLoads()) {
1876 // There are no SMRD extloads, so if we have to do a small type access we
1877 // will use a MUBUF load.
1878 // FIXME?: We also need to do this if unaligned, but we don't know the
1879 // alignment here.
1880 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1881 return isLegalGlobalAddressingMode(AM);
1882 }
1883
1884 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1885 // SMRD instructions have an 8-bit, dword offset on SI.
1886 if (!isUInt<8>(AM.BaseOffs / 4))
1887 return false;
1888 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1889 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1890 // in 8-bits, it can use a smaller encoding.
1891 if (!isUInt<32>(AM.BaseOffs / 4))
1892 return false;
1893 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1894 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1895 if (!isUInt<20>(AM.BaseOffs))
1896 return false;
1897 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1898 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1899 // for S_BUFFER_* instructions).
1900 if (!isInt<21>(AM.BaseOffs))
1901 return false;
1902 } else {
1903 // On GFX12, all offsets are signed 24-bit in bytes.
1904 if (!isInt<24>(AM.BaseOffs))
1905 return false;
1906 }
1907
1908 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1910 AM.BaseOffs < 0) {
1911 // Scalar (non-buffer) loads can only use a negative offset if
1912 // soffset+offset is non-negative. Since the compiler can only prove that
1913 // in a few special cases, it is safer to claim that negative offsets are
1914 // not supported.
1915 return false;
1916 }
1917
1918 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1919 return true;
1920
1921 if (AM.Scale == 1 && AM.HasBaseReg)
1922 return true;
1923
1924 return false;
1925 }
1926
1927 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1928 return Subtarget->enableFlatScratch()
1930 : isLegalMUBUFAddressingMode(AM);
1931
1932 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1933 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1934 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1935 // field.
1936 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1937 // an 8-bit dword offset but we don't know the alignment here.
1938 if (!isUInt<16>(AM.BaseOffs))
1939 return false;
1940
1941 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1942 return true;
1943
1944 if (AM.Scale == 1 && AM.HasBaseReg)
1945 return true;
1946
1947 return false;
1948 }
1949
1951 // For an unknown address space, this usually means that this is for some
1952 // reason being used for pure arithmetic, and not based on some addressing
1953 // computation. We don't have instructions that compute pointers with any
1954 // addressing modes, so treat them as having no offset like flat
1955 // instructions.
1957 }
1958
1959 // Assume a user alias of global for unknown address spaces.
1960 return isLegalGlobalAddressingMode(AM);
1961}
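// Illustrative sketch (not part of the lowering): the scalar (SMRD/SMEM)
// immediate-offset ranges checked above, folded into one helper. The
// generation is modelled as a plain enum purely for the example; the real
// code queries Subtarget->getGeneration() and additionally rejects negative
// offsets for scalar loads from the constant address spaces.
enum class SMemGenSketch { SI, CI, VI, GFX9To11, GFX12Plus };
[[maybe_unused]] static bool fitsSMemImmOffsetSketch(SMemGenSketch Gen,
                                                     int64_t ByteOffs) {
  switch (Gen) {
  case SMemGenSketch::SI:        // 8-bit unsigned dword offset.
    return ByteOffs >= 0 && (ByteOffs / 4) < (1 << 8);
  case SMemGenSketch::CI:        // 32-bit literal dword offset.
    return ByteOffs >= 0 && (ByteOffs / 4) <= 0xffffffffLL;
  case SMemGenSketch::VI:        // 20-bit unsigned byte offset.
    return ByteOffs >= 0 && ByteOffs < (1 << 20);
  case SMemGenSketch::GFX9To11:  // Signed 21-bit byte offset.
    return ByteOffs >= -(1 << 20) && ByteOffs < (1 << 20);
  case SMemGenSketch::GFX12Plus: // Signed 24-bit byte offset.
    return ByteOffs >= -(1 << 23) && ByteOffs < (1 << 23);
  }
  return false;
}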
1962
1964 const MachineFunction &MF) const {
1966 return (MemVT.getSizeInBits() <= 4 * 32);
1967 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1968 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1969 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1970 }
1972 return (MemVT.getSizeInBits() <= 2 * 32);
1973 return true;
1974}
1975
1977 unsigned Size, unsigned AddrSpace, Align Alignment,
1978 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1979 if (IsFast)
1980 *IsFast = 0;
1981
1982 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1983 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1984 // Check if alignment requirements for ds_read/write instructions are
1985 // disabled.
1986 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1987 return false;
1988
1989 Align RequiredAlignment(
1990 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1991 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1992 Alignment < RequiredAlignment)
1993 return false;
1994
1995 // Either the alignment requirements are "enabled", or there is an
1996 // unaligned LDS access related hardware bug even though alignment requirements
1997 // are "disabled". In either case, we need to check for proper alignment
1998 // requirements.
1999 //
2000 switch (Size) {
2001 case 64:
2002 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2003 // address is negative, then the instruction is incorrectly treated as
2004 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2005 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2006 // load later in the SILoadStoreOptimizer.
2007 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2008 return false;
2009
2010 // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
2011 // can do a 4 byte aligned, 8 byte access in a single operation using
2012 // ds_read2/write2_b32 with adjacent offsets.
2013 RequiredAlignment = Align(4);
2014
2015 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2016 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2017 // ds_write2_b32 depending on the alignment. In either case with either
2018 // alignment there is no faster way of doing this.
2019
2020 // The numbers returned here and below are not additive, it is a 'speed
2021 // rank'. They are just meant to be compared to decide if a certain way
2022 // of lowering an operation is faster than another. For that purpose
2023 // a naturally aligned operation gets its bitsize to indicate that "it
2024 // operates with a speed comparable to an N-bit wide load". With the full
2025 // alignment ds128 is slower than ds96 for example. If underaligned it
2026 // is comparable to a speed of a single dword access, which would then
2027 // mean 32 < 128 and it is faster to issue a wide load regardless.
2028 // 1 is simply "slow, don't do it". I.e., when comparing an aligned load to
2029 // a wider load which will no longer be aligned, the latter is slower.
2030 if (IsFast)
2031 *IsFast = (Alignment >= RequiredAlignment) ? 64
2032 : (Alignment < Align(4)) ? 32
2033 : 1;
2034 return true;
2035 }
2036
2037 break;
2038 case 96:
2039 if (!Subtarget->hasDS96AndDS128())
2040 return false;
2041
2042 // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
2043 // gfx8 and older.
2044
2045 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2046 // Naturally aligned access is fastest. However, also report it is Fast
2047 // if memory is aligned less than a DWORD. A narrow load or store will
2048 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
2049 // be more of them, so overall we will pay less penalty issuing a single
2050 // instruction.
2051
2052 // See comment on the values above.
2053 if (IsFast)
2054 *IsFast = (Alignment >= RequiredAlignment) ? 96
2055 : (Alignment < Align(4)) ? 32
2056 : 1;
2057 return true;
2058 }
2059
2060 break;
2061 case 128:
2062 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2063 return false;
2064
2065 // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
2066 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
2067 // single operation using ds_read2/write2_b64.
2068 RequiredAlignment = Align(8);
2069
2070 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2071 // Naturally aligned access is fastest. However, also report it is Fast
2072 // if memory is aligned less than a DWORD. A narrow load or store will
2073 // be equally slow as a single ds_read_b128/ds_write_b128, but there
2074 // will be more of them, so overall we will pay less penalty issuing a
2075 // single instruction.
2076
2077 // See comment on the values above.
2078 if (IsFast)
2079 *IsFast = (Alignment >= RequiredAlignment) ? 128
2080 : (Alignment < Align(4)) ? 32
2081 : 1;
2082 return true;
2083 }
2084
2085 break;
2086 default:
2087 if (Size > 32)
2088 return false;
2089
2090 break;
2091 }
2092
2093 // See comment on the values above.
2094 // Note that we have a single-dword or sub-dword here, so if underaligned
2095 // it is the slowest possible access, hence the returned value is 0.
2096 if (IsFast)
2097 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2098
2099 return Alignment >= RequiredAlignment ||
2100 Subtarget->hasUnalignedDSAccessEnabled();
2101 }
2102
2103 // FIXME: We have to be conservative here and assume that flat operations
2104 // will access scratch. If we had access to the IR function, then we
2105 // could determine if any private memory was used in the function.
2106 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2107 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2108 bool AlignedBy4 = Alignment >= Align(4);
2109 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2110 if (IsFast)
2111 *IsFast = AlignedBy4 ? Size : 1;
2112 return true;
2113 }
2114
2115 if (IsFast)
2116 *IsFast = AlignedBy4;
2117
2118 return AlignedBy4;
2119 }
2120
2121 // So long as they are correct, wide global memory operations perform better
2122 // than multiple smaller memory ops -- even when misaligned
2123 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2124 if (IsFast)
2125 *IsFast = Size;
2126
2127 return Alignment >= Align(4) ||
2128 Subtarget->hasUnalignedBufferAccessEnabled();
2129 }
2130
2131 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2132 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2133 // out-of-bounds behavior, but in the edge case where an access starts
2134 // out-of-bounds and then enters in-bounds, the entire access would be treated
2135 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2136 // natural alignment of buffer accesses.
2137 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2138 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2139 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2140 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2141 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2142 return false;
2143 }
2144
2145 // Smaller-than-dword values must be aligned.
2146 if (Size < 32)
2147 return false;
2148
2149 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2150 // byte-address are ignored, thus forcing Dword alignment.
2151 // This applies to private, global, and constant memory.
2152 if (IsFast)
2153 *IsFast = 1;
2154
2155 return Size >= 32 && Alignment >= Align(4);
2156}
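// Illustrative sketch (not part of the lowering): the LDS "speed rank"
// reported above for 64/96/128-bit accesses when unaligned DS access is
// enabled. The values only have meaning relative to each other.
[[maybe_unused]] static unsigned ldsSpeedRankSketch(unsigned SizeInBits,
                                                    unsigned AlignInBytes,
                                                    unsigned RequiredAlign) {
  if (AlignInBytes >= RequiredAlign)
    return SizeInBits; // Sufficiently aligned: rank equals the access width.
  if (AlignInBytes < 4)
    return 32; // Sub-dword alignment: narrower accesses would be just as
               // slow, so the single wide access still compares favourably.
  return 1;    // Dword-aligned but under the requirement: prefer splitting
               // into naturally aligned narrower accesses.
}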
2157
2159 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2160 unsigned *IsFast) const {
2162 Alignment, Flags, IsFast);
2163}
2164
2166 LLVMContext &Context, const MemOp &Op,
2167 const AttributeList &FuncAttributes) const {
2168 // FIXME: Should account for address space here.
2169
2170 // The default fallback uses the private pointer size as a guess for a type to
2171 // use. Make sure we switch these to 64-bit accesses.
2172
2173 if (Op.size() >= 16 &&
2174 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2175 return MVT::v4i32;
2176
2177 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2178 return MVT::v2i32;
2179
2180 // Use the default.
2181 return MVT::Other;
2182}
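// Illustrative sketch (not part of the lowering): the memcpy/memset type
// selection above, expressed over plain sizes. "Default" stands in for
// MVT::Other, i.e. let the generic lowering pick a type.
enum class MemOpTypeSketch { V4I32, V2I32, Default };
[[maybe_unused]] static MemOpTypeSketch
pickMemOpTypeSketch(uint64_t Size, bool DstAlignedBy4) {
  if (Size >= 16 && DstAlignedBy4)
    return MemOpTypeSketch::V4I32; // 16-byte (dwordx4) chunks.
  if (Size >= 8 && DstAlignedBy4)
    return MemOpTypeSketch::V2I32; // 8-byte (dwordx2) chunks.
  return MemOpTypeSketch::Default;
}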
2183
2185 const MemSDNode *MemNode = cast<MemSDNode>(N);
2186 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2187}
2188
2193
2195 unsigned DestAS) const {
2196 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2197 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2198 Subtarget->hasGloballyAddressableScratch()) {
2199 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2200 return false;
2201 }
2202
2203 // Flat -> private/local is a simple truncate.
2204 // Flat -> global is no-op
2205 return true;
2206 }
2207
2208 const GCNTargetMachine &TM =
2209 static_cast<const GCNTargetMachine &>(getTargetMachine());
2210 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2211}
2212
2220
2222 Type *Ty) const {
2223 // FIXME: Could be smarter if called for vector constants.
2224 return true;
2225}
2226
2228 unsigned Index) const {
2230 return false;
2231
2232 // TODO: Add more cases that are cheap.
2233 return Index == 0;
2234}
2235
2236bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2237 // TODO: This should be more aggressive, particular for 16-bit element
2238 // vectors. However there are some mixed improvements and regressions.
2239 EVT EltTy = VT.getVectorElementType();
2240 return EltTy.getSizeInBits() % 32 == 0;
2241}
2242
2244 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2245 switch (Op) {
2246 case ISD::LOAD:
2247 case ISD::STORE:
2248 return true;
2249 default:
2250 return false;
2251 }
2252 }
2253
2254 // SimplifySetCC uses this function to determine whether or not it should
2255 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2256 if (VT == MVT::i1 && Op == ISD::SETCC)
2257 return false;
2258
2260}
2261
2262SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2263 const SDLoc &SL,
2264 SDValue Chain,
2265 uint64_t Offset) const {
2266 const DataLayout &DL = DAG.getDataLayout();
2270
2271 auto [InputPtrReg, RC, ArgTy] =
2272 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2273
2274 // We may not have the kernarg segment argument if we have no kernel
2275 // arguments.
2276 if (!InputPtrReg)
2277 return DAG.getConstant(Offset, SL, PtrVT);
2278
2280 SDValue BasePtr = DAG.getCopyFromReg(
2281 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2282
2283 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2284}
2285
2286SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2287 const SDLoc &SL) const {
2290 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2291}
2292
2293SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2294 const SDLoc &SL) const {
2295
2297 std::optional<uint32_t> KnownSize =
2299 if (KnownSize.has_value())
2300 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2301 return SDValue();
2302}
2303
2304SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2305 const SDLoc &SL, SDValue Val,
2306 bool Signed,
2307 const ISD::InputArg *Arg) const {
2308 // First, if it is a widened vector, narrow it.
2309 if (VT.isVector() &&
2311 EVT NarrowedVT =
2314 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2315 DAG.getConstant(0, SL, MVT::i32));
2316 }
2317
2318 // Then convert the vector elements or scalar value.
2319 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2320 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2321 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2322 }
2323
2324 if (MemVT.isFloatingPoint())
2325 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2326 else if (Signed)
2327 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2328 else
2329 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2330
2331 return Val;
2332}
2333
2334SDValue SITargetLowering::lowerKernargMemParameter(
2335 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2336 uint64_t Offset, Align Alignment, bool Signed,
2337 const ISD::InputArg *Arg) const {
2338 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2339
2340 // Try to avoid using an extload by loading earlier than the argument address,
2341 // and extracting the relevant bits. The load should hopefully be merged with
2342 // the previous argument.
2343 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2344 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2345 int64_t AlignDownOffset = alignDown(Offset, 4);
2346 int64_t OffsetDiff = Offset - AlignDownOffset;
2347
2348 EVT IntVT = MemVT.changeTypeToInteger();
2349
2350 // TODO: If we passed in the base kernel offset we could have a better
2351 // alignment than 4, but we don't really need it.
2352 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2353 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2356
2357 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2358 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2359
2360 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2361 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2362 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2363
2364 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2365 }
2366
2367 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2368 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2371
2372 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2373 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2374}
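// Illustrative sketch (not part of the lowering): the arithmetic used above to
// pull a sub-dword kernel argument out of a dword-aligned i32 load. The loaded
// dword is simply passed in here; the real code materializes it with an
// aligned load from the kernarg segment and performs the shift and truncate as
// DAG nodes. Little-endian byte order is assumed, as on AMDGPU.
[[maybe_unused]] static uint32_t
extractSubDwordKernArgSketch(uint32_t LoadedDword, uint64_t ArgByteOffset,
                             unsigned ArgByteSize) {
  uint64_t AlignedOffset = ArgByteOffset & ~uint64_t(3); // alignDown(Offset, 4)
  unsigned ShiftBits = unsigned(ArgByteOffset - AlignedOffset) * 8;
  uint32_t Shifted = LoadedDword >> ShiftBits;           // ISD::SRL
  if (ArgByteSize >= 4)
    return Shifted;
  return Shifted & ((1u << (ArgByteSize * 8)) - 1);      // ISD::TRUNCATE
}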
2375
2376/// Coerce an argument which was passed in a different ABI type to the original
2377/// expected value type.
2378SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2379 SDValue Val,
2380 CCValAssign &VA,
2381 const SDLoc &SL) const {
2382 EVT ValVT = VA.getValVT();
2383
2384 // If this is an 8 or 16-bit value, it is really passed promoted
2385 // to 32 bits. Insert an assert[sz]ext to capture this, then
2386 // truncate to the right size.
2387 switch (VA.getLocInfo()) {
2388 case CCValAssign::Full:
2389 return Val;
2390 case CCValAssign::BCvt:
2391 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2392 case CCValAssign::SExt:
2393 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2394 DAG.getValueType(ValVT));
2395 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2396 case CCValAssign::ZExt:
2397 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2398 DAG.getValueType(ValVT));
2399 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2400 case CCValAssign::AExt:
2401 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2402 default:
2403 llvm_unreachable("Unknown loc info!");
2404 }
2405}
2406
2407SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2408 CCValAssign &VA, const SDLoc &SL,
2409 SDValue Chain,
2410 const ISD::InputArg &Arg) const {
2411 MachineFunction &MF = DAG.getMachineFunction();
2412 MachineFrameInfo &MFI = MF.getFrameInfo();
2413
2414 if (Arg.Flags.isByVal()) {
2415 unsigned Size = Arg.Flags.getByValSize();
2416 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2417 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2418 }
2419
2420 unsigned ArgOffset = VA.getLocMemOffset();
2421 unsigned ArgSize = VA.getValVT().getStoreSize();
2422
2423 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2424
2425 // Create load nodes to retrieve arguments from the stack.
2426 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2427
2428 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2430 MVT MemVT = VA.getValVT();
2431
2432 switch (VA.getLocInfo()) {
2433 default:
2434 break;
2435 case CCValAssign::BCvt:
2436 MemVT = VA.getLocVT();
2437 break;
2438 case CCValAssign::SExt:
2439 ExtType = ISD::SEXTLOAD;
2440 break;
2441 case CCValAssign::ZExt:
2442 ExtType = ISD::ZEXTLOAD;
2443 break;
2444 case CCValAssign::AExt:
2445 ExtType = ISD::EXTLOAD;
2446 break;
2447 }
2448
2449 SDValue ArgValue = DAG.getExtLoad(
2450 ExtType, SL, VA.getLocVT(), Chain, FIN,
2452
2453 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2454 if (ConvertedVal == ArgValue)
2455 return ConvertedVal;
2456
2457 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2458}
2459
2460SDValue SITargetLowering::lowerWorkGroupId(
2461 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2464 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2465 if (!Subtarget->hasClusters())
2466 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2467
2468 // Clusters are supported. Return the global position in the grid. If clusters
2469 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2470
2471 // WorkGroupIdXYZ = ClusterId == 0 ?
2472 // ClusterIdXYZ :
2473 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2474 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2475 SDLoc SL(ClusterIdXYZ);
2476 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2477 SDValue One = DAG.getConstant(1, SL, VT);
2478 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2479 SDValue ClusterWorkGroupIdXYZ =
2480 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2481 SDValue GlobalIdXYZ =
2482 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2483 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2484
2485 switch (MFI.getClusterDims().getKind()) {
2488 return GlobalIdXYZ;
2490 return ClusterIdXYZ;
2492 using namespace AMDGPU::Hwreg;
2493 SDValue ClusterIdField =
2494 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2495 SDNode *GetReg =
2496 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2497 SDValue ClusterId(GetReg, 0);
2498 SDValue Zero = DAG.getConstant(0, SL, VT);
2499 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2500 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2501 }
2502 }
2503
2504 llvm_unreachable("nothing should reach here");
2505}
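// Illustrative sketch (not part of the lowering): the per-dimension
// reconstruction above of a global workgroup ID from cluster information.
// When the hardware cluster-ID field reads zero, clusters are not in use and
// the "cluster ID" value already holds the workgroup ID.
[[maybe_unused]] static uint32_t
workGroupIdFromClusterSketch(uint32_t ClusterId, uint32_t ClusterMaxId,
                             uint32_t ClusterWorkGroupId,
                             uint32_t HwClusterIdField) {
  if (HwClusterIdField == 0)
    return ClusterId; // No clusters: this already is the workgroup ID.
  uint32_t ClusterSize = ClusterMaxId + 1; // Workgroups per cluster, this dim.
  return ClusterId * ClusterSize + ClusterWorkGroupId;
}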
2506
2507SDValue SITargetLowering::getPreloadedValue(
2508 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2510 const ArgDescriptor *Reg = nullptr;
2511 const TargetRegisterClass *RC;
2512 LLT Ty;
2513
2515 const ArgDescriptor WorkGroupIDX =
2516 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2517 // If GridZ is not programmed in an entry function then the hardware will set
2518 // it to all zeros, so there is no need to mask the GridY value in the low
2519 // order bits.
2520 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2521 AMDGPU::TTMP7,
2522 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2523 const ArgDescriptor WorkGroupIDZ =
2524 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2525 const ArgDescriptor ClusterWorkGroupIDX =
2526 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2527 const ArgDescriptor ClusterWorkGroupIDY =
2528 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2529 const ArgDescriptor ClusterWorkGroupIDZ =
2530 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2531 const ArgDescriptor ClusterWorkGroupMaxIDX =
2532 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2533 const ArgDescriptor ClusterWorkGroupMaxIDY =
2534 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2535 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2536 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2537 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2538 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2539
2540 auto LoadConstant = [&](unsigned N) {
2541 return DAG.getConstant(N, SDLoc(), VT);
2542 };
2543
2544 if (Subtarget->hasArchitectedSGPRs() &&
2546 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2547 bool HasFixedDims = ClusterDims.isFixedDims();
2548
2549 switch (PVID) {
2551 Reg = &WorkGroupIDX;
2552 RC = &AMDGPU::SReg_32RegClass;
2553 Ty = LLT::scalar(32);
2554 break;
2556 Reg = &WorkGroupIDY;
2557 RC = &AMDGPU::SReg_32RegClass;
2558 Ty = LLT::scalar(32);
2559 break;
2561 Reg = &WorkGroupIDZ;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDX;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2574 return LoadConstant(0);
2575 Reg = &ClusterWorkGroupIDY;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2581 return LoadConstant(0);
2582 Reg = &ClusterWorkGroupIDZ;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[0] - 1);
2589 Reg = &ClusterWorkGroupMaxIDX;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 if (HasFixedDims)
2595 return LoadConstant(ClusterDims.getDims()[1] - 1);
2596 Reg = &ClusterWorkGroupMaxIDY;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(32);
2599 break;
2601 if (HasFixedDims)
2602 return LoadConstant(ClusterDims.getDims()[2] - 1);
2603 Reg = &ClusterWorkGroupMaxIDZ;
2604 RC = &AMDGPU::SReg_32RegClass;
2605 Ty = LLT::scalar(32);
2606 break;
2608 Reg = &ClusterWorkGroupMaxFlatID;
2609 RC = &AMDGPU::SReg_32RegClass;
2610 Ty = LLT::scalar(32);
2611 break;
2612 default:
2613 break;
2614 }
2615 }
2616
2617 if (!Reg)
2618 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2619 if (!Reg) {
2621 // It's possible for a kernarg intrinsic call to appear in a kernel with
2622 // no allocated segment, in which case we do not add the user sgpr
2623 // argument, so just return null.
2624 return DAG.getConstant(0, SDLoc(), VT);
2625 }
2626
2627 // It's undefined behavior if a function marked with the amdgpu-no-*
2628 // attributes uses the corresponding intrinsic.
2629 return DAG.getPOISON(VT);
2630 }
2631
2632 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2633}
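// Illustrative sketch (not part of the lowering): decoding the architected
// TTMP6 cluster fields whose masks are listed above. Each field occupies one
// 4-bit nibble; the nibble index follows the order of the masks in the code
// (ClusterWorkGroupID X/Y/Z, ClusterWorkGroupMaxID X/Y/Z, then MaxFlatID).
[[maybe_unused]] static uint32_t ttmp6ClusterFieldSketch(uint32_t Ttmp6,
                                                         unsigned NibbleIdx) {
  return (Ttmp6 >> (NibbleIdx * 4)) & 0xF;
}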
2634
2636 CallingConv::ID CallConv,
2637 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2638 FunctionType *FType,
2640 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2641 const ISD::InputArg *Arg = &Ins[I];
2642
2643 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2644 "vector type argument should have been split");
2645
2646 // First check if it's a PS input addr.
2647 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2648 PSInputNum <= 15) {
2649 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2650
2651 // Inconveniently only the first part of the split is marked as isSplit,
2652 // so skip to the end. We only want to increment PSInputNum once for the
2653 // entire split argument.
2654 if (Arg->Flags.isSplit()) {
2655 while (!Arg->Flags.isSplitEnd()) {
2656 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2657 "unexpected vector split in ps argument type");
2658 if (!SkipArg)
2659 Splits.push_back(*Arg);
2660 Arg = &Ins[++I];
2661 }
2662 }
2663
2664 if (SkipArg) {
2665 // We can safely skip PS inputs.
2666 Skipped.set(Arg->getOrigArgIndex());
2667 ++PSInputNum;
2668 continue;
2669 }
2670
2671 Info->markPSInputAllocated(PSInputNum);
2672 if (Arg->Used)
2673 Info->markPSInputEnabled(PSInputNum);
2674
2675 ++PSInputNum;
2676 }
2677
2678 Splits.push_back(*Arg);
2679 }
2680}
2681
2682// Allocate special inputs passed in VGPRs.
2684 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2685 SIMachineFunctionInfo &Info) const {
2686 const LLT S32 = LLT::scalar(32);
2688
2689 if (Info.hasWorkItemIDX()) {
2690 Register Reg = AMDGPU::VGPR0;
2691 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2692
2693 CCInfo.AllocateReg(Reg);
2694 unsigned Mask =
2695 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2696 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2697 }
2698
2699 if (Info.hasWorkItemIDY()) {
2700 assert(Info.hasWorkItemIDX());
2701 if (Subtarget->hasPackedTID()) {
2702 Info.setWorkItemIDY(
2703 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2704 } else {
2705 unsigned Reg = AMDGPU::VGPR1;
2706 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2707
2708 CCInfo.AllocateReg(Reg);
2709 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2710 }
2711 }
2712
2713 if (Info.hasWorkItemIDZ()) {
2714 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2715 if (Subtarget->hasPackedTID()) {
2716 Info.setWorkItemIDZ(
2717 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2718 } else {
2719 unsigned Reg = AMDGPU::VGPR2;
2720 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2721
2722 CCInfo.AllocateReg(Reg);
2723 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2724 }
2725 }
2726}
2727
2728// Try to allocate a VGPR at the end of the argument list, or if no argument
2729 // VGPRs are left, allocate a stack slot.
2730 // If \p Mask is given, it indicates the bitfield position in the register.
2731 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2732static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2733 ArgDescriptor Arg = ArgDescriptor()) {
2734 if (Arg.isSet())
2735 return ArgDescriptor::createArg(Arg, Mask);
2736
2737 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2738 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2739 if (RegIdx == ArgVGPRs.size()) {
2740 // Spill to stack required.
2741 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2742
2743 return ArgDescriptor::createStack(Offset, Mask);
2744 }
2745
2746 unsigned Reg = ArgVGPRs[RegIdx];
2747 Reg = CCInfo.AllocateReg(Reg);
2748 assert(Reg != AMDGPU::NoRegister);
2749
2750 MachineFunction &MF = CCInfo.getMachineFunction();
2751 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2752 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2753 return ArgDescriptor::createRegister(Reg, Mask);
2754}
2755
2757 const TargetRegisterClass *RC,
2758 unsigned NumArgRegs) {
2759 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2760 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2761 if (RegIdx == ArgSGPRs.size())
2762 report_fatal_error("ran out of SGPRs for arguments");
2763
2764 unsigned Reg = ArgSGPRs[RegIdx];
2765 Reg = CCInfo.AllocateReg(Reg);
2766 assert(Reg != AMDGPU::NoRegister);
2767
2768 MachineFunction &MF = CCInfo.getMachineFunction();
2769 MF.addLiveIn(Reg, RC);
2771}
2772
2773// If this has a fixed position, we still should allocate the register in the
2774// CCInfo state. Technically we could get away with this for values passed
2775// outside of the normal argument range.
2777 const TargetRegisterClass *RC,
2778 MCRegister Reg) {
2779 Reg = CCInfo.AllocateReg(Reg);
2780 assert(Reg != AMDGPU::NoRegister);
2781 MachineFunction &MF = CCInfo.getMachineFunction();
2782 MF.addLiveIn(Reg, RC);
2783}
2784
2785static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2786 if (Arg) {
2787 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2788 Arg.getRegister());
2789 } else
2790 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2791}
2792
2793static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2794 if (Arg) {
2795 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2796 Arg.getRegister());
2797 } else
2798 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2799}
2800
2801/// Allocate implicit function VGPR arguments at the end of allocated user
2802/// arguments.
2804 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2805 SIMachineFunctionInfo &Info) const {
2806 const unsigned Mask = 0x3ff;
2807 ArgDescriptor Arg;
2808
2809 if (Info.hasWorkItemIDX()) {
2810 Arg = allocateVGPR32Input(CCInfo, Mask);
2811 Info.setWorkItemIDX(Arg);
2812 }
2813
2814 if (Info.hasWorkItemIDY()) {
2815 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2816 Info.setWorkItemIDY(Arg);
2817 }
2818
2819 if (Info.hasWorkItemIDZ())
2820 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2821}
2822
2823/// Allocate implicit function VGPR arguments in fixed registers.
2825 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2826 SIMachineFunctionInfo &Info) const {
2827 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2828 if (!Reg)
2829 report_fatal_error("failed to allocate VGPR for implicit arguments");
2830
2831 const unsigned Mask = 0x3ff;
2832 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2833 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2834 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2835}
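// Illustrative sketch (not part of the lowering): how the three workitem IDs
// are unpacked from the single packed ID VGPR allocated above, with X in bits
// [9:0], Y in [19:10] and Z in [29:20] (the 0x3ff masks shifted by 0/10/20).
[[maybe_unused]] static uint32_t unpackWorkItemIdSketch(uint32_t PackedId,
                                                        unsigned Dim) {
  return (PackedId >> (Dim * 10)) & 0x3ff; // Dim: 0 = X, 1 = Y, 2 = Z.
}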
2836
2838 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2839 SIMachineFunctionInfo &Info) const {
2840 auto &ArgInfo = Info.getArgInfo();
2841 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2842
2843 // TODO: Unify handling with private memory pointers.
2844 if (UserSGPRInfo.hasDispatchPtr())
2845 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2846
2847 if (UserSGPRInfo.hasQueuePtr())
2848 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2849
2850 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2851 // constant offset from the kernarg segment.
2852 if (Info.hasImplicitArgPtr())
2853 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2854
2855 if (UserSGPRInfo.hasDispatchID())
2856 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2857
2858 // flat_scratch_init is not applicable for non-kernel functions.
2859
2860 if (Info.hasWorkGroupIDX())
2861 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2862
2863 if (Info.hasWorkGroupIDY())
2864 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2865
2866 if (Info.hasWorkGroupIDZ())
2867 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2868
2869 if (Info.hasLDSKernelId())
2870 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2871}
2872
2873// Allocate special inputs passed in user SGPRs.
2875 MachineFunction &MF,
2876 const SIRegisterInfo &TRI,
2877 SIMachineFunctionInfo &Info) const {
2878 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2879 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2880 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2881 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2882 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2883 }
2884
2885 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2886 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2887 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2888 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2889 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2890 }
2891
2892 if (UserSGPRInfo.hasDispatchPtr()) {
2893 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2894 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2895 CCInfo.AllocateReg(DispatchPtrReg);
2896 }
2897
2898 if (UserSGPRInfo.hasQueuePtr()) {
2899 Register QueuePtrReg = Info.addQueuePtr(TRI);
2900 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2901 CCInfo.AllocateReg(QueuePtrReg);
2902 }
2903
2904 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2906 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2907 CCInfo.AllocateReg(InputPtrReg);
2908
2909 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2910 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2911 }
2912
2913 if (UserSGPRInfo.hasDispatchID()) {
2914 Register DispatchIDReg = Info.addDispatchID(TRI);
2915 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2916 CCInfo.AllocateReg(DispatchIDReg);
2917 }
2918
2919 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2920 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2921 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2922 CCInfo.AllocateReg(FlatScratchInitReg);
2923 }
2924
2925 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2926 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2927 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2928 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2929 }
2930
2931 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2932 // these from the dispatch pointer.
2933}
2934
2935 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2936 // sequential, starting from the first argument.
2938 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2940 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2941 Function &F = MF.getFunction();
2942 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2943 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2944 bool InPreloadSequence = true;
2945 unsigned InIdx = 0;
2946 bool AlignedForImplictArgs = false;
2947 unsigned ImplicitArgOffset = 0;
2948 for (auto &Arg : F.args()) {
2949 if (!InPreloadSequence || !Arg.hasInRegAttr())
2950 break;
2951
2952 unsigned ArgIdx = Arg.getArgNo();
2953 // Don't preload non-original args or parts not in the current preload
2954 // sequence.
2955 if (InIdx < Ins.size() &&
2956 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2957 break;
2958
2959 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2960 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2961 InIdx++) {
2962 assert(ArgLocs[ArgIdx].isMemLoc());
2963 auto &ArgLoc = ArgLocs[InIdx];
2964 const Align KernelArgBaseAlign = Align(16);
2965 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2966 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2967 unsigned NumAllocSGPRs =
2968 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2969
2970 // Fix alignment for hidden arguments.
2971 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2972 if (!AlignedForImplictArgs) {
2973 ImplicitArgOffset =
2974 alignTo(LastExplicitArgOffset,
2975 Subtarget->getAlignmentForImplicitArgPtr()) -
2976 LastExplicitArgOffset;
2977 AlignedForImplictArgs = true;
2978 }
2979 ArgOffset += ImplicitArgOffset;
2980 }
2981
2982 // Arg is preloaded into the previous SGPR.
2983 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2984 assert(InIdx >= 1 && "No previous SGPR");
2985 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2986 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2987 continue;
2988 }
2989
2990 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2991 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2992 // Check for free user SGPRs for preloading.
2993 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2994 InPreloadSequence = false;
2995 break;
2996 }
2997
2998 // Preload this argument.
2999 const TargetRegisterClass *RC =
3000 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3001 SmallVectorImpl<MCRegister> *PreloadRegs =
3002 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3003
3004 if (PreloadRegs->size() > 1)
3005 RC = &AMDGPU::SGPR_32RegClass;
3006 for (auto &Reg : *PreloadRegs) {
3007 assert(Reg);
3008 MF.addLiveIn(Reg, RC);
3009 CCInfo.AllocateReg(Reg);
3010 }
3011
3012 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3013 }
3014 }
3015}
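// Illustrative sketch (not part of the lowering): the user-SGPR accounting
// used above when deciding whether the next kernel argument can still be
// preloaded. Argument sizes are in bits, offsets in bytes, matching the code.
[[maybe_unused]] static bool
fitsInFreeUserSGPRsSketch(unsigned ArgSizeInBits, unsigned ArgOffset,
                          unsigned LastExplicitArgOffset,
                          unsigned NumFreeUserSGPRs) {
  unsigned NumAllocSGPRs = (ArgSizeInBits + 31) / 32; // alignTo(Bits, 32) / 32
  unsigned Padding = ArgOffset - LastExplicitArgOffset;
  unsigned PaddingSGPRs = (Padding + 3) / 4;          // alignTo(Bytes, 4) / 4
  return PaddingSGPRs + NumAllocSGPRs <= NumFreeUserSGPRs;
}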
3016
3018 const SIRegisterInfo &TRI,
3019 SIMachineFunctionInfo &Info) const {
3020 // Always allocate this last since it is a synthetic preload.
3021 if (Info.hasLDSKernelId()) {
3022 Register Reg = Info.addLDSKernelId();
3023 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3024 CCInfo.AllocateReg(Reg);
3025 }
3026}
3027
3028// Allocate special input registers that are initialized per-wave.
3031 CallingConv::ID CallConv,
3032 bool IsShader) const {
3033 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3034 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3035 // Note: user SGPRs are handled by the front-end for graphics shaders
3036 // Pad up the used user SGPRs with dead inputs.
3037
3038 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3039 // before enabling architected SGPRs for workgroup IDs.
3040 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3041
3042 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3043 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3044 // rely on it to reach 16 since if we end up having no stack usage, it will
3045 // not really be added.
3046 unsigned NumRequiredSystemSGPRs =
3047 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3048 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3049 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3050 Register Reg = Info.addReservedUserSGPR();
3051 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(Reg);
3053 }
3054 }
3055
3056 if (!HasArchitectedSGPRs) {
3057 if (Info.hasWorkGroupIDX()) {
3058 Register Reg = Info.addWorkGroupIDX();
3059 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3060 CCInfo.AllocateReg(Reg);
3061 }
3062
3063 if (Info.hasWorkGroupIDY()) {
3064 Register Reg = Info.addWorkGroupIDY();
3065 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3066 CCInfo.AllocateReg(Reg);
3067 }
3068
3069 if (Info.hasWorkGroupIDZ()) {
3070 Register Reg = Info.addWorkGroupIDZ();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3072 CCInfo.AllocateReg(Reg);
3073 }
3074 }
3075
3076 if (Info.hasWorkGroupInfo()) {
3077 Register Reg = Info.addWorkGroupInfo();
3078 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3079 CCInfo.AllocateReg(Reg);
3080 }
3081
3082 if (Info.hasPrivateSegmentWaveByteOffset()) {
3083 // Scratch wave offset passed in system SGPR.
3084 unsigned PrivateSegmentWaveByteOffsetReg;
3085
3086 if (IsShader) {
3087 PrivateSegmentWaveByteOffsetReg =
3088 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3089
3090 // This is true if the scratch wave byte offset doesn't have a fixed
3091 // location.
3092 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3093 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3094 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3095 }
3096 } else
3097 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3098
3099 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3100 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3101 }
3102
3103 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3104 Info.getNumPreloadedSGPRs() >= 16);
3105}
3106
3108 MachineFunction &MF,
3109 const SIRegisterInfo &TRI,
3111 // Now that we've figured out where the scratch register inputs are, see if we
3112 // should reserve the arguments and use them directly.
3113 MachineFrameInfo &MFI = MF.getFrameInfo();
3114 bool HasStackObjects = MFI.hasStackObjects();
3115 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3116
3117 // Record that we know we have non-spill stack objects so we don't need to
3118 // check all stack objects later.
3119 if (HasStackObjects)
3120 Info.setHasNonSpillStackObjects(true);
3121
3122 // Everything live out of a block is spilled with fast regalloc, so it's
3123 // almost certain that spilling will be required.
3124 if (TM.getOptLevel() == CodeGenOptLevel::None)
3125 HasStackObjects = true;
3126
3127 // For now assume stack access is needed in any callee functions, so we need
3128 // the scratch registers to pass in.
3129 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3130
3131 if (!ST.enableFlatScratch()) {
3132 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3133 // If we have stack objects, we unquestionably need the private buffer
3134 // resource. For the Code Object V2 ABI, this will be the first 4 user
3135 // SGPR inputs. We can reserve those and use them directly.
3136
3137 Register PrivateSegmentBufferReg =
3139 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3140 } else {
3141 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3142 // We tentatively reserve the last registers (skipping those which may
3143 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
3144 // we'll replace these with the ones immediately after those which were
3145 // really allocated. In the prologue copies will be inserted from the
3146 // argument to these reserved registers.
3147
3148 // Without HSA, relocations are used for the scratch pointer and the
3149 // buffer resource setup is always inserted in the prologue. Scratch wave
3150 // offset is still in an input SGPR.
3151 Info.setScratchRSrcReg(ReservedBufferReg);
3152 }
3153 }
3154
3156
3157 // For entry functions we have to set up the stack pointer if we use it,
3158 // whereas non-entry functions get this "for free". This means there is no
3159 // intrinsic advantage to using S32 over S34 in cases where we do not have
3160 // calls but do need a frame pointer (i.e. if we are requested to have one
3161 // because frame pointer elimination is disabled). To keep things simple we
3162 // only ever use S32 as the call ABI stack pointer, and so using it does not
3163 // imply we need a separate frame pointer.
3164 //
3165 // Try to use s32 as the SP, but move it if it would interfere with input
3166 // arguments. This won't work with calls though.
3167 //
3168 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3169 // registers.
3170 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3171 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3172 } else {
3174
3175 if (MFI.hasCalls())
3176 report_fatal_error("call in graphics shader with too many input SGPRs");
3177
3178 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3179 if (!MRI.isLiveIn(Reg)) {
3180 Info.setStackPtrOffsetReg(Reg);
3181 break;
3182 }
3183 }
3184
3185 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3186 report_fatal_error("failed to find register for SP");
3187 }
3188
3189 // hasFP should be accurate for entry functions even before the frame is
3190 // finalized, because it does not rely on the known stack size, only
3191 // properties like whether variable sized objects are present.
3192 if (ST.getFrameLowering()->hasFP(MF)) {
3193 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3194 }
3195}
3196
3199 return !Info->isEntryFunction();
3200}
3201
3203
3205 MachineBasicBlock *Entry,
3206 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3208
3209 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3210 if (!IStart)
3211 return;
3212
3213 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3214 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3215 MachineBasicBlock::iterator MBBI = Entry->begin();
3216 for (const MCPhysReg *I = IStart; *I; ++I) {
3217 const TargetRegisterClass *RC = nullptr;
3218 if (AMDGPU::SReg_64RegClass.contains(*I))
3219 RC = &AMDGPU::SGPR_64RegClass;
3220 else if (AMDGPU::SReg_32RegClass.contains(*I))
3221 RC = &AMDGPU::SGPR_32RegClass;
3222 else
3223 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3224
3225 Register NewVR = MRI->createVirtualRegister(RC);
3226 // Create copy from CSR to a virtual register.
3227 Entry->addLiveIn(*I);
3228 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3229 .addReg(*I);
3230
3231 // Insert the copy-back instructions right before the terminator.
3232 for (auto *Exit : Exits)
3233 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3234 TII->get(TargetOpcode::COPY), *I)
3235 .addReg(NewVR);
3236 }
3237}
3238
3240 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3241 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3242 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3244
3246 const Function &Fn = MF.getFunction();
3249 bool IsError = false;
3250
3251 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3253 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3254 IsError = true;
3255 }
3256
3259 BitVector Skipped(Ins.size());
3260 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3261 *DAG.getContext());
3262
3263 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3264 bool IsKernel = AMDGPU::isKernel(CallConv);
3265 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3266
3267 if (IsGraphics) {
3268 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3269 assert(!UserSGPRInfo.hasDispatchPtr() &&
3270 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3271 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3272 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3273 (void)UserSGPRInfo;
3274 if (!Subtarget->enableFlatScratch())
3275 assert(!UserSGPRInfo.hasFlatScratchInit());
3276 if ((CallConv != CallingConv::AMDGPU_CS &&
3277 CallConv != CallingConv::AMDGPU_Gfx &&
3278 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3279 !Subtarget->hasArchitectedSGPRs())
3280 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3281 !Info->hasWorkGroupIDZ());
3282 }
3283
3284 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3285
3286 if (CallConv == CallingConv::AMDGPU_PS) {
3287 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3288
3289 // At least one interpolation mode must be enabled or else the GPU will
3290 // hang.
3291 //
3292 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3293 // set PSInputAddr, the user wants to enable some bits after the compilation
3294 // based on run-time states. Since we can't know what the final PSInputEna
3295 // will look like, we shouldn't do anything here and the user should take
3296 // responsibility for the correct programming.
3297 //
3298 // Otherwise, the following restrictions apply:
3299 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3300 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3301 // enabled too.
3302 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3303 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3304 CCInfo.AllocateReg(AMDGPU::VGPR0);
3305 CCInfo.AllocateReg(AMDGPU::VGPR1);
3306 Info->markPSInputAllocated(0);
3307 Info->markPSInputEnabled(0);
3308 }
3309 if (Subtarget->isAmdPalOS()) {
3310 // For isAmdPalOS, the user does not enable some bits after compilation
3311 // based on run-time states; the register values being generated here are
3312 // the final ones set in hardware. Therefore we need to apply the
3313 // workaround to PSInputAddr and PSInputEnable together. (The case where
3314 // a bit is set in PSInputAddr but not PSInputEnable is where the
3315 // frontend set up an input arg for a particular interpolation mode, but
3316 // nothing uses that input arg. Really we should have an earlier pass
3317 // that removes such an arg.)
3318 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3319 if ((PsInputBits & 0x7F) == 0 ||
3320 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3321 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3322 }
3323 } else if (IsKernel) {
3324 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3325 } else {
3326 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3327 Ins.end());
3328 }
3329
3330 if (IsKernel)
3331 analyzeFormalArgumentsCompute(CCInfo, Ins);
3332
3333 if (IsEntryFunc) {
3334 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3335 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3336 if (IsKernel && Subtarget->hasKernargPreload())
3337 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3338
3339 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3340 } else if (!IsGraphics) {
3341 // For the fixed ABI, pass workitem IDs in the last argument register.
3342 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3343
3344 // FIXME: Sink this into allocateSpecialInputSGPRs
3345 if (!Subtarget->enableFlatScratch())
3346 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3347
3348 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3349 }
3350
3351 if (!IsKernel) {
3352 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3353 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3354
3355 // This assumes the registers are allocated by CCInfo in ascending order
3356 // with no gaps.
3357 Info->setNumWaveDispatchSGPRs(
3358 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3359 Info->setNumWaveDispatchVGPRs(
3360 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3361 } else if (Info->getNumKernargPreloadedSGPRs()) {
3362 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3363 }
3364
3366
3367 if (IsWholeWaveFunc) {
3369 {MVT::i1, MVT::Other}, Chain);
3370 InVals.push_back(Setup.getValue(0));
3371 Chains.push_back(Setup.getValue(1));
3372 }
3373
3374 // FIXME: This is the minimum kernel argument alignment. We should improve
3375 // this to the maximum alignment of the arguments.
3376 //
3377 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3378 // kern arg offset.
3379 const Align KernelArgBaseAlign = Align(16);
3380
3381 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3382 ++i) {
3383 const ISD::InputArg &Arg = Ins[i];
3384 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3385 InVals.push_back(DAG.getPOISON(Arg.VT));
3386 continue;
3387 }
3388
3389 CCValAssign &VA = ArgLocs[ArgIdx++];
3390 MVT VT = VA.getLocVT();
3391
3392 if (IsEntryFunc && VA.isMemLoc()) {
3393 VT = Ins[i].VT;
3394 EVT MemVT = VA.getLocVT();
3395
3396 const uint64_t Offset = VA.getLocMemOffset();
3397 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3398
3399 if (Arg.Flags.isByRef()) {
3400 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3401
3402 const GCNTargetMachine &TM =
3403 static_cast<const GCNTargetMachine &>(getTargetMachine());
3404 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3405 Arg.Flags.getPointerAddrSpace())) {
3406 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3407 Arg.Flags.getPointerAddrSpace());
3408 }
3409
3410 InVals.push_back(Ptr);
3411 continue;
3412 }
3413
3414 SDValue NewArg;
3415 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3416 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3417 // In this case the argument is packed into the previous preload SGPR.
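// For example (illustrative): an i16 kernarg at byte offset 6 has
// AlignDownOffset = 4 and OffsetDiff = 2, so the value is recovered by
// shifting the preloaded 32-bit SGPR right by 16 bits and truncating.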
3418 int64_t AlignDownOffset = alignDown(Offset, 4);
3419 int64_t OffsetDiff = Offset - AlignDownOffset;
3420 EVT IntVT = MemVT.changeTypeToInteger();
3421
3422 const SIMachineFunctionInfo *Info =
3423 MF.getInfo<SIMachineFunctionInfo>();
3424 
3425 Register Reg =
3426 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3427
3428 assert(Reg);
3429 Register VReg = MRI.getLiveInVirtReg(Reg);
3430 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3431
3432 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3433 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3434
3435 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3436 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3437 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3438 Ins[i].Flags.isSExt(), &Ins[i]);
3439
3440 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3441 } else {
3442 const SIMachineFunctionInfo *Info =
3443 MF.getInfo<SIMachineFunctionInfo>();
3444 
3445 const SmallVectorImpl<MCRegister> &PreloadRegs =
3446 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3447
3448 SDValue Copy;
3449 if (PreloadRegs.size() == 1) {
3450 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3451 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3452 NewArg = DAG.getCopyFromReg(
3453 Chain, DL, VReg,
3454 EVT::getIntegerVT(*DAG.getContext(),
3455 TRI->getRegSizeInBits(*RC)));
3456
3457 } else {
3458 // If the kernarg alignment does not match the alignment of the SGPR
3459 // tuple RC that can accommodate this argument, it will be built up
3460 // via copies from the individual SGPRs that the argument was
3461 // preloaded to.
3462 SmallVector<SDValue, 4> Elts;
3463 for (auto Reg : PreloadRegs) {
3464 Register VReg = MRI.getLiveInVirtReg(Reg);
3465 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3466 Elts.push_back(Copy);
3467 }
3468 NewArg =
3469 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3470 PreloadRegs.size()),
3471 DL, Elts);
3472 }
3473
3474 // If the argument was preloaded to multiple consecutive 32-bit
3475 // registers because of misalignment between addressable SGPR tuples
3476 // and the argument size, we can still assume, because of kernarg
3477 // segment alignment restrictions, that NewArg's size is the same as
3478 // MemVT and just do a bitcast. If MemVT is less than 32 bits we add a
3479 // truncate, since we cannot preload to less than a single SGPR and the
3480 // MemVT may be smaller.
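// For example (illustrative): an i16 argument that was preloaded into a full
// SGPR arrives here as an i32 value; MemVTInt is i16, so the TRUNCATE below
// drops the upper bits before the final bitcast back to MemVT.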
3481 EVT MemVTInt =
3482 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3483 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3484 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3485
3486 NewArg = DAG.getBitcast(MemVT, NewArg);
3487 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3488 Ins[i].Flags.isSExt(), &Ins[i]);
3489 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3490 }
3491 } else {
3492 // Hidden arguments that are in the kernel signature must be preloaded
3493 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3494 // the argument list and is not preloaded.
3495 if (Arg.isOrigArg()) {
3496 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3497 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3498 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3499 *OrigArg->getParent(),
3500 "hidden argument in kernel signature was not preloaded",
3501 DL.getDebugLoc()));
3502 }
3503 }
3504
3505 NewArg =
3506 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3507 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3508 }
3509 Chains.push_back(NewArg.getValue(1));
3510
3511 auto *ParamTy =
3512 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3513 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3514 ParamTy &&
3515 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3516 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3517 // On SI, local pointers are just offsets into LDS, so they always fit
3518 // in 16 bits. On CI and newer they could potentially be
3519 // real pointers, so we can't guarantee their size.
3520 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3521 DAG.getValueType(MVT::i16));
3522 }
3523
3524 InVals.push_back(NewArg);
3525 continue;
3526 }
3527 if (!IsEntryFunc && VA.isMemLoc()) {
3528 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3529 InVals.push_back(Val);
3530 if (!Arg.Flags.isByVal())
3531 Chains.push_back(Val.getValue(1));
3532 continue;
3533 }
3534
3535 assert(VA.isRegLoc() && "Parameter must be in a register!");
3536
3537 Register Reg = VA.getLocReg();
3538 const TargetRegisterClass *RC = nullptr;
3539 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3540 RC = &AMDGPU::VGPR_32RegClass;
3541 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3542 RC = &AMDGPU::SGPR_32RegClass;
3543 else
3544 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3545
3546 Reg = MF.addLiveIn(Reg, RC);
3547 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3548
3549 if (Arg.Flags.isSRet()) {
3550 // The return object should be reasonably addressable.
3551
3552 // FIXME: This helps when the return is a real sret. If it is an
3553 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3554 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3555 unsigned NumBits =
3556 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3557 Val = DAG.getNode(
3558 ISD::AssertZext, DL, VT, Val,
3559 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3560 }
3561
3562 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3563 InVals.push_back(Val);
3564 }
3565
3566 // Start adding system SGPRs.
3567 if (IsEntryFunc)
3568 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3569
3570 // DAG.getPass() returns nullptr when using new pass manager.
3571 // TODO: Use DAG.getMFAM() to access analysis result.
3572 if (DAG.getPass()) {
3573 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3574 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3575 }
3576
3577 unsigned StackArgSize = CCInfo.getStackSize();
3578 Info->setBytesInStackArgArea(StackArgSize);
3579
3580 return Chains.empty() ? Chain
3581 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3582}
3583
3584// TODO: If return values can't fit in registers, we should return as many as
3585// possible in registers before passing on stack.
3587 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3588 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3589 const Type *RetTy) const {
3590 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3591 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3592 // for shaders. Vector types should be explicitly handled by CC.
3593 if (AMDGPU::isEntryFunctionCC(CallConv))
3594 return true;
3595
3596 SmallVector<CCValAssign, 16> RVLocs;
3597 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3598 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3599 return false;
3600
3601 // We must use the stack if return would require unavailable registers.
3602 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3603 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3604 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3605 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3606 return false;
3607
3608 return true;
3609}
3610
3611SDValue
3613 bool isVarArg,
3615 const SmallVectorImpl<SDValue> &OutVals,
3616 const SDLoc &DL, SelectionDAG &DAG) const {
3620
3621 if (AMDGPU::isKernel(CallConv)) {
3622 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3623 OutVals, DL, DAG);
3624 }
3625
3626 bool IsShader = AMDGPU::isShader(CallConv);
3627
3628 Info->setIfReturnsVoid(Outs.empty());
3629 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3630
3631 // CCValAssign - represent the assignment of the return value to a location.
3632 SmallVector<CCValAssign, 48> RVLocs;
3633 
3634 // CCState - Info about the registers and stack slots.
3635 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3636 *DAG.getContext());
3637
3638 // Analyze outgoing return values.
3639 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3640
3641 SDValue Glue;
3642 SmallVector<SDValue, 48> RetOps;
3643 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3644
3645 SDValue ReadFirstLane =
3646 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3647 // Copy the result values into the output registers.
3648 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3649 ++I, ++RealRVLocIdx) {
3650 CCValAssign &VA = RVLocs[I];
3651 assert(VA.isRegLoc() && "Can only return in registers!");
3652 // TODO: Partially return in registers if return values don't fit.
3653 SDValue Arg = OutVals[RealRVLocIdx];
3654
3655 // Copied from other backends.
3656 switch (VA.getLocInfo()) {
3657 case CCValAssign::Full:
3658 break;
3659 case CCValAssign::BCvt:
3660 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3661 break;
3662 case CCValAssign::SExt:
3663 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3664 break;
3665 case CCValAssign::ZExt:
3666 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3667 break;
3668 case CCValAssign::AExt:
3669 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3670 break;
3671 default:
3672 llvm_unreachable("Unknown loc info!");
3673 }
3674 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3675 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VA.getLocVT(),
3676 ReadFirstLane, Arg);
3677 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3678 Glue = Chain.getValue(1);
3679 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3680 }
3681
3682 // FIXME: Does sret work properly?
3683 if (!Info->isEntryFunction()) {
3684 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3685 const MCPhysReg *I =
3686 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3687 if (I) {
3688 for (; *I; ++I) {
3689 if (AMDGPU::SReg_64RegClass.contains(*I))
3690 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3691 else if (AMDGPU::SReg_32RegClass.contains(*I))
3692 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3693 else
3694 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3695 }
3696 }
3697 }
3698
3699 // Update chain and glue.
3700 RetOps[0] = Chain;
3701 if (Glue.getNode())
3702 RetOps.push_back(Glue);
3703
3704 unsigned Opc = AMDGPUISD::ENDPGM;
3705 if (!IsWaveEnd)
3706 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3707 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3709 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3710}
3711
3713 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3714 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3715 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3716 SDValue ThisVal) const {
3717 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3718
3719 // Assign locations to each value returned by this call.
3720 SmallVector<CCValAssign, 16> RVLocs;
3721 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3722 *DAG.getContext());
3723 CCInfo.AnalyzeCallResult(Ins, RetCC);
3724
3725 // Copy all of the result registers out of their specified physreg.
3726 for (CCValAssign VA : RVLocs) {
3727 SDValue Val;
3728
3729 if (VA.isRegLoc()) {
3730 Val =
3731 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3732 Chain = Val.getValue(1);
3733 InGlue = Val.getValue(2);
3734 } else if (VA.isMemLoc()) {
3735 report_fatal_error("TODO: return values in memory");
3736 } else
3737 llvm_unreachable("unknown argument location type");
3738
3739 switch (VA.getLocInfo()) {
3740 case CCValAssign::Full:
3741 break;
3742 case CCValAssign::BCvt:
3743 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3744 break;
3745 case CCValAssign::ZExt:
3746 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3747 DAG.getValueType(VA.getValVT()));
3748 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3749 break;
3750 case CCValAssign::SExt:
3751 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3752 DAG.getValueType(VA.getValVT()));
3753 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3754 break;
3755 case CCValAssign::AExt:
3756 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3757 break;
3758 default:
3759 llvm_unreachable("Unknown loc info!");
3760 }
3761
3762 InVals.push_back(Val);
3763 }
3764
3765 return Chain;
3766}
3767
3768 // Add code to pass the special inputs required by the features in use,
3769 // separately from the explicit user arguments present in the IR.
3771 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3772 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3773 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3774 // If we don't have a call site, this was a call inserted by
3775 // legalization. These can never use special inputs.
3776 if (!CLI.CB)
3777 return;
3778
3779 SelectionDAG &DAG = CLI.DAG;
3780 const SDLoc &DL = CLI.DL;
3781 const Function &F = DAG.getMachineFunction().getFunction();
3782
3783 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3784 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3785
3786 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3787 &AMDGPUTargetMachine::FixedFuncArgInfo;
3788 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3789 // DAG.getPass() returns nullptr when using new pass manager.
3790 // TODO: Use DAG.getMFAM() to access analysis result.
3791 if (DAG.getPass()) {
3792 auto &ArgUsageInfo =
3793 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3794 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3795 }
3796 }
3797
3798 // TODO: Unify with private memory register handling. This is complicated by
3799 // the fact that at least in kernels, the input argument is not necessarily
3800 // in the same location as the input.
3801 // clang-format off
3802 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3803 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3804 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3805 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3806 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3807 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3808 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3809 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3810 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3811 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3812 };
3813 // clang-format on
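// For example (illustrative): if the call site carries both
// "amdgpu-no-workgroup-id-x" and "amdgpu-no-cluster-id-x", WORKGROUP_ID_X is
// skipped entirely and no SGPR or stack slot is allocated for it; otherwise
// the caller's value (or a poison placeholder) is forwarded below.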
3814
3815 for (auto [InputID, Attrs] : ImplicitAttrs) {
3816 // If the callee does not use the attribute value, skip copying the value.
3817 if (all_of(Attrs, [&](StringRef Attr) {
3818 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3819 }))
3820 continue;
3821
3822 const auto [OutgoingArg, ArgRC, ArgTy] =
3823 CalleeArgInfo->getPreloadedValue(InputID);
3824 if (!OutgoingArg)
3825 continue;
3826
3827 const auto [IncomingArg, IncomingArgRC, Ty] =
3828 CallerArgInfo.getPreloadedValue(InputID);
3829 assert(IncomingArgRC == ArgRC);
3830
3831 // All special arguments are ints for now.
3832 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3833 SDValue InputReg;
3834
3835 if (IncomingArg) {
3836 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3837 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3838 // The implicit arg ptr is special because it doesn't have a corresponding
3839 // input for kernels, and is computed from the kernarg segment pointer.
3840 InputReg = getImplicitArgPtr(DAG, DL);
3841 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3842 std::optional<uint32_t> Id =
3843 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3844 if (Id.has_value()) {
3845 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3846 } else {
3847 InputReg = DAG.getPOISON(ArgVT);
3848 }
3849 } else {
3850 // We may have proven the input wasn't needed, although the ABI still
3851 // requires it. We just need to allocate the register appropriately.
3852 InputReg = DAG.getPOISON(ArgVT);
3853 }
3854
3855 if (OutgoingArg->isRegister()) {
3856 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3857 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3858 report_fatal_error("failed to allocate implicit input argument");
3859 } else {
3860 unsigned SpecialArgOffset =
3861 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3862 SDValue ArgStore =
3863 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3864 MemOpChains.push_back(ArgStore);
3865 }
3866 }
3867
3868 // Pack workitem IDs into a single register, or pass them as-is if
3869 // already packed.
3870
3871 auto [OutgoingArg, ArgRC, Ty] =
3872 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3873 if (!OutgoingArg)
3874 std::tie(OutgoingArg, ArgRC, Ty) =
3875 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3876 if (!OutgoingArg)
3877 std::tie(OutgoingArg, ArgRC, Ty) =
3878 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3879 if (!OutgoingArg)
3880 return;
3881 
3882 const ArgDescriptor *IncomingArgX = std::get<0>(
3883 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3884 const ArgDescriptor *IncomingArgY = std::get<0>(
3885 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3886 const ArgDescriptor *IncomingArgZ = std::get<0>(
3887 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3888 
3889 SDValue InputReg;
3890 SDLoc SL;
3891
3892 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3893 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3894 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3895
3896 // If the incoming IDs are not packed, we need to pack them.
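// For example: with all three IDs live, the packed register holds X in bits
// [9:0], Y in bits [19:10] and Z in bits [29:20], i.e.
// InputReg = X | (Y << 10) | (Z << 20), matching the shifts below.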
3897 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3898 NeedWorkItemIDX) {
3899 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3900 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3901 } else {
3902 InputReg = DAG.getConstant(0, DL, MVT::i32);
3903 }
3904 }
3905
3906 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3907 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3908 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3909 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3910 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3911 InputReg = InputReg.getNode()
3912 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3913 : Y;
3914 }
3915
3916 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3917 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3918 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3919 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3920 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3921 InputReg = InputReg.getNode()
3922 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3923 : Z;
3924 }
3925
3926 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3927 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3928 // We're in a situation where the outgoing function requires the workitem
3929 // ID, but the calling function does not have it (e.g. a graphics function
3930 // calling a C calling convention function). This is illegal, but we need
3931 // to produce something.
3932 InputReg = DAG.getPOISON(MVT::i32);
3933 } else {
3934 // Workitem IDs are already packed; any present incoming argument
3935 // will carry all the required fields.
3936 ArgDescriptor IncomingArg =
3937 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3938 : IncomingArgY ? *IncomingArgY
3939 : *IncomingArgZ,
3940 ~0u);
3941 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3942 }
3943 }
3944
3945 if (OutgoingArg->isRegister()) {
3946 if (InputReg)
3947 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3948
3949 CCInfo.AllocateReg(OutgoingArg->getRegister());
3950 } else {
3951 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3952 if (InputReg) {
3953 SDValue ArgStore =
3954 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3955 MemOpChains.push_back(ArgStore);
3956 }
3957 }
3958}
3959
3961 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3963 const SmallVectorImpl<SDValue> &OutVals,
3964 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3965 if (AMDGPU::isChainCC(CalleeCC))
3966 return true;
3967
3968 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3969 return false;
3970
3971 // For a divergent call target, we need to do a waterfall loop over the
3972 // possible callees which precludes us from using a simple jump.
3973 if (Callee->isDivergent())
3974 return false;
3975
3977 const Function &CallerF = MF.getFunction();
3978 CallingConv::ID CallerCC = CallerF.getCallingConv();
3979 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3980 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3981
3982 // Kernels aren't callable, and don't have a live-in return address, so it
3983 // doesn't make sense to do a tail call with entry functions.
3984 if (!CallerPreserved)
3985 return false;
3986
3987 bool CCMatch = CallerCC == CalleeCC;
3988
3989 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
3990 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3991 return true;
3992 return false;
3993 }
3994
3995 // TODO: Can we handle var args?
3996 if (IsVarArg)
3997 return false;
3998
3999 for (const Argument &Arg : CallerF.args()) {
4000 if (Arg.hasByValAttr())
4001 return false;
4002 }
4003
4004 LLVMContext &Ctx = *DAG.getContext();
4005
4006 // Check that the call results are passed in the same way.
4007 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4008 CCAssignFnForCall(CalleeCC, IsVarArg),
4009 CCAssignFnForCall(CallerCC, IsVarArg)))
4010 return false;
4011
4012 // The callee has to preserve all registers the caller needs to preserve.
4013 if (!CCMatch) {
4014 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4015 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4016 return false;
4017 }
4018
4019 // Nothing more to check if the callee is taking no arguments.
4020 if (Outs.empty())
4021 return true;
4022
4023 SmallVector<CCValAssign, 16> ArgLocs;
4024 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4025
4026 // FIXME: We are not allocating special input registers, so we will be
4027 // deciding based on incorrect register assignments.
4028 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4029
4030 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4031 // If the stack arguments for this call do not fit into our own save area then
4032 // the call cannot be made tail.
4033 // TODO: Is this really necessary?
4034 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4035 return false;
4036
4037 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4038 // FIXME: What about inreg arguments that end up passed in memory?
4039 if (!CCVA.isRegLoc())
4040 continue;
4041
4042 // If we are passing an argument in an SGPR, and the value is divergent,
4043 // this call requires a waterfall loop.
4044 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4045 LLVM_DEBUG(
4046 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4047 << printReg(CCVA.getLocReg(), TRI) << '\n');
4048 return false;
4049 }
4050 }
4051
4052 const MachineRegisterInfo &MRI = MF.getRegInfo();
4053 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4054}
4055
4057 if (!CI->isTailCall())
4058 return false;
4059
4060 const Function *ParentFn = CI->getParent()->getParent();
4061 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4062 return false;
4063 return true;
4064}
4065
4066namespace {
4067 // Chain calls have special arguments that we need to handle. They tag
4068 // along at the end of the argument list(s), after the SGPR and VGPR
4069// arguments (index 0 and 1 respectively).
4070enum ChainCallArgIdx {
4071 Exec = 2,
4072 Flags,
4073 NumVGPRs,
4074 FallbackExec,
4075 FallbackCallee
4076};
4077} // anonymous namespace
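// For example (illustrative): with Flags == 0 the only special operands are
// {Exec, Flags}; if bit 0 of Flags is set (dynamic VGPR mode, wave32 only),
// the call must also supply NumVGPRs, FallbackExec and FallbackCallee, so the
// argument list is expected to end exactly at FallbackCallee.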
4078
4079// The wave scratch offset register is used as the global base pointer.
4081 SmallVectorImpl<SDValue> &InVals) const {
4082 CallingConv::ID CallConv = CLI.CallConv;
4083 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4084
4085 SelectionDAG &DAG = CLI.DAG;
4086
4087 const SDLoc &DL = CLI.DL;
4088 SDValue Chain = CLI.Chain;
4089 SDValue Callee = CLI.Callee;
4090
4091 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4092 bool UsesDynamicVGPRs = false;
4093 if (IsChainCallConv) {
4094 // The last arguments should be the value that we need to put in EXEC,
4095 // followed by the flags and any other arguments with special meanings.
4096 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4097 // we don't treat them like the "real" arguments.
4098 auto RequestedExecIt =
4099 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4100 return Arg.OrigArgIndex == 2;
4101 });
4102 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4103
4104 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4105 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4106 CLI.OutVals.end());
4107 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4108
4109 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4110 "Haven't popped all the special args");
4111
4112 TargetLowering::ArgListEntry RequestedExecArg =
4113 CLI.Args[ChainCallArgIdx::Exec];
4114 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4115 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4116
4117 // Convert constants into TargetConstants, so they become immediate operands
4118 // instead of being selected into S_MOV.
4119 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4120 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4121 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4122 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4123 } else
4124 ChainCallSpecialArgs.push_back(Arg.Node);
4125 };
4126
4127 PushNodeOrTargetConstant(RequestedExecArg);
4128
4129 // Process any other special arguments depending on the value of the flags.
4130 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4131
4132 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4133 if (FlagsValue.isZero()) {
4134 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4135 return lowerUnhandledCall(CLI, InVals,
4136 "no additional args allowed if flags == 0");
4137 } else if (FlagsValue.isOneBitSet(0)) {
4138 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4139 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4140 }
4141
4142 if (!Subtarget->isWave32()) {
4143 return lowerUnhandledCall(
4144 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4145 }
4146
4147 UsesDynamicVGPRs = true;
4148 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4149 CLI.Args.end(), PushNodeOrTargetConstant);
4150 }
4151 }
4152
4153 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4154 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4155 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4156 bool &IsTailCall = CLI.IsTailCall;
4157 bool IsVarArg = CLI.IsVarArg;
4158 bool IsSibCall = false;
4159 MachineFunction &MF = DAG.getMachineFunction();
4160 
4161 if (Callee.isUndef() || isNullConstant(Callee)) {
4162 if (!CLI.IsTailCall) {
4163 for (ISD::InputArg &Arg : CLI.Ins)
4164 InVals.push_back(DAG.getPOISON(Arg.VT));
4165 }
4166
4167 return Chain;
4168 }
4169
4170 if (IsVarArg) {
4171 return lowerUnhandledCall(CLI, InVals,
4172 "unsupported call to variadic function ");
4173 }
4174
4175 if (!CLI.CB)
4176 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4177
4178 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4179 return lowerUnhandledCall(CLI, InVals,
4180 "unsupported required tail call to function ");
4181 }
4182
4183 if (IsTailCall) {
4184 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4185 Outs, OutVals, Ins, DAG);
4186 if (!IsTailCall &&
4187 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4188 report_fatal_error("failed to perform tail call elimination on a call "
4189 "site marked musttail or on llvm.amdgcn.cs.chain");
4190 }
4191
4192 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4193
4194 // A sibling call is one where we're under the usual C ABI and not planning
4195 // to change that but can still do a tail call:
4196 if (!TailCallOpt && IsTailCall)
4197 IsSibCall = true;
4198
4199 if (IsTailCall)
4200 ++NumTailCalls;
4201 }
4202
4205 SmallVector<SDValue, 8> MemOpChains;
4206
4207 // Analyze operands of the call, assigning locations to each operand.
4208 SmallVector<CCValAssign, 16> ArgLocs;
4209 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4210 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4211
4212 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4213 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4214 // With a fixed ABI, allocate fixed registers before user arguments.
4215 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4216 }
4217
4218 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4219
4220 // Get a count of how many bytes are to be pushed on the stack.
4221 unsigned NumBytes = CCInfo.getStackSize();
4222
4223 if (IsSibCall) {
4224 // Since we're not changing the ABI to make this a tail call, the memory
4225 // operands are already available in the caller's incoming argument space.
4226 NumBytes = 0;
4227 }
4228
4229 // FPDiff is the byte offset of the call's argument area from the callee's.
4230 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4231 // by this amount for a tail call. In a sibling call it must be 0 because the
4232 // caller will deallocate the entire stack and the callee still expects its
4233 // arguments to begin at SP+0. Completely unused for non-tail calls.
4234 int32_t FPDiff = 0;
4235 MachineFrameInfo &MFI = MF.getFrameInfo();
4236 auto *TRI = Subtarget->getRegisterInfo();
4237
4238 // Adjust the stack pointer for the new arguments...
4239 // These operations are automatically eliminated by the prolog/epilog pass
4240 if (!IsSibCall)
4241 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4242
4243 if (!IsSibCall || IsChainCallConv) {
4244 if (!Subtarget->enableFlatScratch()) {
4245 SmallVector<SDValue, 4> CopyFromChains;
4246
4247 // In the HSA case, this should be an identity copy.
4248 SDValue ScratchRSrcReg =
4249 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4250 RegsToPass.emplace_back(IsChainCallConv
4251 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4252 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4253 ScratchRSrcReg);
4254 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4255 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4256 }
4257 }
4258
4259 const unsigned NumSpecialInputs = RegsToPass.size();
4260
4261 MVT PtrVT = MVT::i32;
4262
4263 // Walk the register/memloc assignments, inserting copies/loads.
4264 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4265 CCValAssign &VA = ArgLocs[i];
4266 SDValue Arg = OutVals[i];
4267
4268 // Promote the value if needed.
4269 switch (VA.getLocInfo()) {
4270 case CCValAssign::Full:
4271 break;
4272 case CCValAssign::BCvt:
4273 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4274 break;
4275 case CCValAssign::ZExt:
4276 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4277 break;
4278 case CCValAssign::SExt:
4279 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4280 break;
4281 case CCValAssign::AExt:
4282 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4283 break;
4284 case CCValAssign::FPExt:
4285 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4286 break;
4287 default:
4288 llvm_unreachable("Unknown loc info!");
4289 }
4290
4291 if (VA.isRegLoc()) {
4292 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4293 } else {
4294 assert(VA.isMemLoc());
4295
4296 SDValue DstAddr;
4297 MachinePointerInfo DstInfo;
4298
4299 unsigned LocMemOffset = VA.getLocMemOffset();
4300 int32_t Offset = LocMemOffset;
4301
4302 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4303 MaybeAlign Alignment;
4304
4305 if (IsTailCall) {
4306 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4307 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4308 : VA.getValVT().getStoreSize();
4309
4310 // FIXME: We can have better than the minimum byval required alignment.
4311 Alignment =
4312 Flags.isByVal()
4313 ? Flags.getNonZeroByValAlign()
4314 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4315
4316 Offset = Offset + FPDiff;
4317 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4318
4319 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4320 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4321
4322 // Make sure any stack arguments overlapping with where we're storing
4323 // are loaded before this eventual operation. Otherwise they'll be
4324 // clobbered.
4325
4326 // FIXME: Why is this really necessary? This seems to just result in a
4327 // lot of code to copy the stack arguments and write them back to the same
4328 // locations, which are supposed to be immutable?
4329 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4330 } else {
4331 // Stores to the argument stack area are relative to the stack pointer.
4332 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4333 MVT::i32);
4334 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4335 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4336 Alignment =
4337 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4338 }
4339
4340 if (Outs[i].Flags.isByVal()) {
4341 SDValue SizeNode =
4342 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4343 SDValue Cpy =
4344 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4345 Outs[i].Flags.getNonZeroByValAlign(),
4346 /*isVol = */ false, /*AlwaysInline = */ true,
4347 /*CI=*/nullptr, std::nullopt, DstInfo,
4348 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4349 
4350 MemOpChains.push_back(Cpy);
4351 } else {
4352 SDValue Store =
4353 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4354 MemOpChains.push_back(Store);
4355 }
4356 }
4357 }
4358
4359 if (!MemOpChains.empty())
4360 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4361
4362 SDValue ReadFirstLaneID =
4363 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4364
4365 SDValue TokenGlue;
4366 if (CLI.ConvergenceControlToken) {
4367 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4368 CLI.ConvergenceControlToken);
4369 }
4370
4371 // Build a sequence of copy-to-reg nodes chained together with token chain
4372 // and flag operands which copy the outgoing args into the appropriate regs.
4373 SDValue InGlue;
4374
4375 unsigned ArgIdx = 0;
4376 for (auto [Reg, Val] : RegsToPass) {
4377 if (ArgIdx++ >= NumSpecialInputs &&
4378 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4379 // For chain calls, the inreg arguments are required to be
4380 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4381 // they are uniform.
4382 //
4383 // For other calls, if an inreg argument is known to be uniform,
4384 // speculatively insert a readfirstlane in case it is in a VGPR.
4385 //
4386 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4387 // value, so let that continue to produce invalid code.
4388
4389 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4390 if (TokenGlue)
4391 ReadfirstlaneArgs.push_back(TokenGlue);
4392 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4393 ReadfirstlaneArgs);
4394 }
4395
4396 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4397 InGlue = Chain.getValue(1);
4398 }
4399
4400 // We don't usually want to end the call-sequence here because we would tidy
4401 // the frame up *after* the call; however, in the ABI-changing tail-call case
4402 // we've carefully laid out the parameters so that when sp is reset they'll be
4403 // in the correct location.
4404 if (IsTailCall && !IsSibCall) {
4405 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4406 InGlue = Chain.getValue(1);
4407 }
4408
4409 std::vector<SDValue> Ops({Chain});
4410
4411 // Add a redundant copy of the callee global which will not be legalized, as
4412 // we need direct access to the callee later.
4413 if (const auto *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4414 const GlobalValue *GV = GSD->getGlobal();
4415 Ops.push_back(Callee);
4416 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4417 } else {
4418 if (IsTailCall) {
4419 // isEligibleForTailCallOptimization considered whether the call target is
4420 // divergent, but we may still end up with a uniform value in a VGPR.
4421 // Insert a readfirstlane just in case.
4422 SDValue ReadFirstLaneID =
4423 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4424
4425 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4426 if (TokenGlue)
4427 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4428 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4429 ReadfirstlaneArgs);
4430 }
4431
4432 Ops.push_back(Callee);
4433 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4434 }
4435
4436 if (IsTailCall) {
4437 // Each tail call may have to adjust the stack by a different amount, so
4438 // this information must travel along with the operation for eventual
4439 // consumption by emitEpilogue.
4440 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4441 }
4442
4443 if (IsChainCallConv)
4444 llvm::append_range(Ops, ChainCallSpecialArgs);
4445
4446 // Add argument registers to the end of the list so that they are known live
4447 // into the call.
4448 for (auto &[Reg, Val] : RegsToPass)
4449 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4450
4451 // Add a register mask operand representing the call-preserved registers.
4452 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4453 assert(Mask && "Missing call preserved mask for calling convention");
4454 Ops.push_back(DAG.getRegisterMask(Mask));
4455
4456 if (SDValue Token = CLI.ConvergenceControlToken) {
4457 SmallVector<SDValue, 4> GlueOps;
4458 GlueOps.push_back(Token);
4459 if (InGlue)
4460 GlueOps.push_back(InGlue);
4461
4462 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4463 MVT::Glue, GlueOps),
4464 0);
4465 }
4466
4467 if (InGlue)
4468 Ops.push_back(InGlue);
4469
4470 // If we're doing a tail call, use a TC_RETURN here rather than an
4471 // actual call instruction.
4472 if (IsTailCall) {
4473 MFI.setHasTailCall();
4474 unsigned OPC = AMDGPUISD::TC_RETURN;
4475 switch (CallConv) {
4478 break;
4481 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4482 : AMDGPUISD::TC_RETURN_CHAIN;
4483 break;
4484 }
4485
4486 // If the caller is a whole wave function, we need to use a special opcode
4487 // so we can patch up EXEC.
4488 if (Info->isWholeWaveFunction())
4490
4491 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4492 }
4493
4494 // Returns a chain and a flag for retval copy to use.
4495 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4496 Chain = Call.getValue(0);
4497 InGlue = Call.getValue(1);
4498
4499 uint64_t CalleePopBytes = NumBytes;
4500 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4501 if (!Ins.empty())
4502 InGlue = Chain.getValue(1);
4503
4504 // Handle result values, copying them out of physregs into vregs that we
4505 // return.
4506 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4507 InVals, /*IsThisReturn=*/false, SDValue());
4508}
4509
4510// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4511// except for:
4512 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4513 // 2. The allocation size is scaled: size = wave-reduction(alloca-size) * wave-size
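// For example (illustrative): a 16-byte per-lane alloca on a wave64 target is
// scaled to 16 << 6 = 1024 bytes of scratch; the stack pointer moves upwards
// by that amount and the returned base address is the old SP.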
4515 SelectionDAG &DAG) const {
4516 const MachineFunction &MF = DAG.getMachineFunction();
4517 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4518 
4519 SDLoc dl(Op);
4520 EVT VT = Op.getValueType();
4521 SDValue Chain = Op.getOperand(0);
4522 Register SPReg = Info->getStackPtrOffsetReg();
4523
4524 // Chain the dynamic stack allocation so that it doesn't modify the stack
4525 // pointer when other instructions are using the stack.
4526 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4527
4528 SDValue Size = Op.getOperand(1);
4529 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4530 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4531
4532 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4534 "Stack grows upwards for AMDGPU");
4535
4536 Chain = BaseAddr.getValue(1);
4537 Align StackAlign = TFL->getStackAlign();
4538 if (Alignment > StackAlign) {
4539 uint64_t ScaledAlignment = Alignment.value()
4540 << Subtarget->getWavefrontSizeLog2();
4541 uint64_t StackAlignMask = ScaledAlignment - 1;
4542 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4543 DAG.getConstant(StackAlignMask, dl, VT));
4544 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4545 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4546 }
4547
4548 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4549 SDValue NewSP;
4550 if (isa<ConstantSDNode>(Size)) {
4551 // For constant sized alloca, scale alloca size by wave-size
4552 SDValue ScaledSize = DAG.getNode(
4553 ISD::SHL, dl, VT, Size,
4554 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4555 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4556 } else {
4557 // For dynamic sized alloca, perform wave-wide reduction to get max of
4558 // alloca size (divergent) and then scale it by wave-size.
4559 SDValue WaveReduction =
4560 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4561 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4562 Size, DAG.getConstant(0, dl, MVT::i32));
4563 SDValue ScaledSize = DAG.getNode(
4564 ISD::SHL, dl, VT, Size,
4565 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4566 NewSP =
4567 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4568 SDValue ReadFirstLaneID =
4569 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4570 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4571 NewSP);
4572 }
4573
4574 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4575 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4576
4577 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4578}
4579
4581 if (Op.getValueType() != MVT::i32)
4582 return Op; // Defer to cannot select error.
4583
4585 SDLoc SL(Op);
4586
4587 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4588
4589 // Convert from wave uniform to swizzled vector address. This should protect
4590 // from any edge cases where the stacksave result isn't directly used with
4591 // stackrestore.
4592 SDValue VectorAddress =
4593 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4594 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4595}
4596
4598 SelectionDAG &DAG) const {
4599 SDLoc SL(Op);
4600 assert(Op.getValueType() == MVT::i32);
4601
4602 uint32_t BothRoundHwReg =
4604 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4605
4606 SDValue IntrinID =
4607 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4608 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4609 Op.getOperand(0), IntrinID, GetRoundBothImm);
4610
4611 // There are two rounding modes, one for f32 and one for f64/f16. We only
4612 // report in the standard value range if both are the same.
4613 //
4614 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4615 // ties away from zero is not supported, and the other values are rotated by
4616 // 1.
4617 //
4618 // If the two rounding modes are not the same, report a target defined value.
4619
4620 // Mode register rounding mode fields:
4621 //
4622 // [1:0] Single-precision round mode.
4623 // [3:2] Double/Half-precision round mode.
4624 //
4625 // 0 = nearest even, 1 = +infinity, 2 = -infinity, 3 = toward zero.
4626 //
4627 // Hardware Spec
4628 // Toward-0 3 0
4629 // Nearest Even 0 1
4630 // +Inf 1 2
4631 // -Inf 2 3
4632 // NearestAway0 N/A 4
4633 //
4634 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4635 // table we can index by the raw hardware mode.
4636 //
4637 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
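// For example (illustrative): if both fields are round-to-nearest-even the raw
// mode is 0b0000, the table yields entry 1, and since 1 < 4 the result is the
// standard FLT_ROUNDS value 1 (to nearest). A mixed mode (e.g. f32 nearest,
// f64/f16 toward zero) yields an entry >= 4, which is offset by 4 below into
// the target-defined range.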
4638
4639 SDValue BitTable =
4640 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4641
4642 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4643 SDValue RoundModeTimesNumBits =
4644 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4645
4646 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4647 // knew only one mode was demanded.
4648 SDValue TableValue =
4649 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4650 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4651
4652 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4653 SDValue TableEntry =
4654 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4655
4656 // There's a gap between the 4-bit encoded table values and the actual enum
4657 // values, so offset if it's an extended value.
4658 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4659 SDValue IsStandardValue =
4660 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4661 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4662 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4663 TableEntry, EnumOffset);
4664
4665 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4666}
4667
4669 SelectionDAG &DAG) const {
4670 SDLoc SL(Op);
4671
4672 SDValue NewMode = Op.getOperand(1);
4673 assert(NewMode.getValueType() == MVT::i32);
4674
4675 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4676 // hardware MODE.fp_round values.
4677 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4678 uint32_t ClampedVal = std::min(
4679 static_cast<uint32_t>(ConstMode->getZExtValue()),
4681 NewMode = DAG.getConstant(
4682 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4683 } else {
4684 // If we know the input can only be one of the supported standard modes in
4685 // the range 0-3, we can use a simplified mapping to hardware values.
4686 KnownBits KB = DAG.computeKnownBits(NewMode);
4687 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4688 // The supported standard values are 0-3. The extended values start at 8. We
4689 // need to offset by 4 if the value is in the extended range.
4690
4691 if (UseReducedTable) {
4692 // Only the low 16 bits of the table are needed; keep them in a 32-bit constant.
4693 SDValue BitTable = DAG.getConstant(
4694 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4695
4696 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4697 SDValue RoundModeTimesNumBits =
4698 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4699
4700 NewMode =
4701 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4702
4703 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4704 // the table extracted bits into inline immediates.
4705 } else {
4706 // table_index = umin(value, value - 4)
4707 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
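// For example (illustrative): a standard value such as 2 (upward) underflows
// in the subtraction, so umin(2, 2 - 4) == 2 selects entry 2, while an
// extended value such as 9 selects entry umin(9, 5) == 5, keeping the table
// compact.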
4708 SDValue BitTable =
4709 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4710
4711 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4712 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4713 SDValue IndexVal =
4714 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4715
4716 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4717 SDValue RoundModeTimesNumBits =
4718 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4719
4720 SDValue TableValue =
4721 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4722 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4723
4724 // No need to mask out the high bits since the setreg will ignore them
4725 // anyway.
4726 NewMode = TruncTable;
4727 }
4728
4729 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4730 // earlier and keep more operations scalar, but that interferes with
4731 // combining the source.
4732 SDValue ReadFirstLaneID =
4733 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4734 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4735 ReadFirstLaneID, NewMode);
4736 }
4737
4738 // N.B. The setreg will be later folded into s_round_mode on supported
4739 // targets.
4740 SDValue IntrinID =
4741 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4742 uint32_t BothRoundHwReg =
4744 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4745
4746 SDValue SetReg =
4747 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4748 IntrinID, RoundBothImm, NewMode);
4749
4750 return SetReg;
4751}
4752
4754 if (Op->isDivergent() &&
4755 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4756 // Cannot do I$ prefetch with divergent pointer.
4757 return SDValue();
4758
4759 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4763 break;
4765 if (Subtarget->hasSafeSmemPrefetch())
4766 break;
4767 [[fallthrough]];
4768 default:
4769 return SDValue();
4770 }
4771
4772 // I$ prefetch
4773 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4774 return SDValue();
4775
4776 return Op;
4777}
4778
4779 // Work around DAG legality rules that are based only on the result type.
4781 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4782 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4783 EVT SrcVT = Src.getValueType();
4784
4785 if (SrcVT.getScalarType() != MVT::bf16)
4786 return Op;
4787
4788 SDLoc SL(Op);
4789 SDValue BitCast =
4790 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4791
4792 EVT DstVT = Op.getValueType();
4793 if (IsStrict)
4794 llvm_unreachable("Need STRICT_BF16_TO_FP");
4795
4796 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4797}
4798
4800 SDLoc SL(Op);
4801 if (Op.getValueType() != MVT::i64)
4802 return Op;
4803
4804 uint32_t ModeHwReg =
4806 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4807 uint32_t TrapHwReg =
4809 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4810
4811 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4812 SDValue IntrinID =
4813 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4814 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4815 Op.getOperand(0), IntrinID, ModeHwRegImm);
4816 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4817 Op.getOperand(0), IntrinID, TrapHwRegImm);
4818 SDValue TokenReg =
4819 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4820 GetTrapReg.getValue(1));
4821
4822 SDValue CvtPtr =
4823 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4824 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4825
4826 return DAG.getMergeValues({Result, TokenReg}, SL);
4827}
4828
4830 SDLoc SL(Op);
4831 if (Op.getOperand(1).getValueType() != MVT::i64)
4832 return Op;
4833
4834 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4835 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4836 DAG.getConstant(0, SL, MVT::i32));
4837 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4838 DAG.getConstant(1, SL, MVT::i32));
4839
4840 SDValue ReadFirstLaneID =
4841 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4842 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4843 ReadFirstLaneID, NewModeReg);
4844 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4845 ReadFirstLaneID, NewTrapReg);
4846
4847 unsigned ModeHwReg =
4849 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4850 unsigned TrapHwReg =
4852 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4853
4854 SDValue IntrinID =
4855 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4856 SDValue SetModeReg =
4857 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4858 IntrinID, ModeHwRegImm, NewModeReg);
4859 SDValue SetTrapReg =
4860 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4861 IntrinID, TrapHwRegImm, NewTrapReg);
4862 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4863}
4864
4866 const MachineFunction &MF) const {
4867 const Function &Fn = MF.getFunction();
4868
4870 .Case("m0", AMDGPU::M0)
4871 .Case("exec", AMDGPU::EXEC)
4872 .Case("exec_lo", AMDGPU::EXEC_LO)
4873 .Case("exec_hi", AMDGPU::EXEC_HI)
4874 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4875 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4876 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4877 .Default(Register());
4878 if (!Reg)
4879 return Reg;
4880
4881 if (!Subtarget->hasFlatScrRegister() &&
4882 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4883 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4884 "\" for subtarget."));
4885 }
4886
4887 switch (Reg) {
4888 case AMDGPU::M0:
4889 case AMDGPU::EXEC_LO:
4890 case AMDGPU::EXEC_HI:
4891 case AMDGPU::FLAT_SCR_LO:
4892 case AMDGPU::FLAT_SCR_HI:
4893 if (VT.getSizeInBits() == 32)
4894 return Reg;
4895 break;
4896 case AMDGPU::EXEC:
4897 case AMDGPU::FLAT_SCR:
4898 if (VT.getSizeInBits() == 64)
4899 return Reg;
4900 break;
4901 default:
4902 llvm_unreachable("missing register type checking");
4903 }
4904
4906 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4907}
4908
4909// If kill is not the last instruction, split the block so kill is always a
4910// proper terminator.
4913 MachineBasicBlock *BB) const {
4914 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4916 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4917 return SplitBB;
4918}
4919
4920// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4921// \p MI will be the only instruction in the loop body block. Otherwise, it will
4922// be the first instruction in the remainder block.
4923//
4924/// \returns { LoopBody, Remainder }
4925static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4927 MachineFunction *MF = MBB.getParent();
4929
4930 // To insert the loop we need to split the block. Move everything after this
4931 // point to a new block, and insert a new empty block between the two.
4933 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4935 ++MBBI;
4936
4937 MF->insert(MBBI, LoopBB);
4938 MF->insert(MBBI, RemainderBB);
4939
4940 LoopBB->addSuccessor(LoopBB);
4941 LoopBB->addSuccessor(RemainderBB);
4942
4943 // Move the rest of the block into a new block.
4944 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4945
4946 if (InstInLoop) {
4947 auto Next = std::next(I);
4948
4949 // Move instruction to loop body.
4950 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4951
4952 // Move the rest of the block.
4953 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4954 } else {
4955 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4956 }
4957
4958 MBB.addSuccessor(LoopBB);
4959
4960 return std::pair(LoopBB, RemainderBB);
4961}
4962
4963/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4965 MachineBasicBlock *MBB = MI.getParent();
4967 auto I = MI.getIterator();
4968 auto E = std::next(I);
4969
4970 // clang-format off
4971 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4972 .addImm(0);
4973 // clang-format on
4974
4975 MIBundleBuilder Bundler(*MBB, I, E);
4976 finalizeBundle(*MBB, Bundler.begin());
4977}
4978
4981 MachineBasicBlock *BB) const {
4982 const DebugLoc &DL = MI.getDebugLoc();
4983
4985
4987
4988 // Apparently kill flags are only valid if the def is in the same block?
4989 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4990 Src->setIsKill(false);
4991
4992 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4993
4994 MachineBasicBlock::iterator I = LoopBB->end();
4995
4996 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4998
4999 // Clear TRAP_STS.MEM_VIOL
5000 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5001 .addImm(0)
5002 .addImm(EncodedReg);
5003
5005
5006 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5007
5008 // Load and check TRAP_STS.MEM_VIOL
5009 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5010 .addImm(EncodedReg);
5011
5012 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5013 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5014 .addReg(Reg, RegState::Kill)
5015 .addImm(0);
5016 // clang-format off
5017 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5018 .addMBB(LoopBB);
5019 // clang-format on
5020
5021 return RemainderBB;
5022}
5023
5024// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5025// wavefront. If the value is uniform and just happens to be in a VGPR, this
5026// will only do one iteration. In the worst case, this will loop 64 times.
5027//
5028// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
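//
// Roughly, the loop that gets emitted is the usual "waterfall" idiom
// (illustrative only):
//
//   loop:
//     v_readfirstlane_b32 s_cur, v_idx
//     v_cmp_eq_u32        cond, s_cur, v_idx      ; lanes holding that index
//     s_and_saveexec_*    new_exec, cond
//     <use s_cur, either directly (GPR-idx mode) or via M0>
//     s_xor_* (term)      exec, exec, new_exec    ; retire the handled lanes
//     s_cbranch_execnz    loop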
5029 static MachineBasicBlock::iterator
5030 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5031 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5032 const DebugLoc &DL, const MachineOperand &Idx,
5033 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5034 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5035 Register &SGPRIdxReg) {
5036
5037 MachineFunction *MF = OrigBB.getParent();
5038 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5039 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5042
5043 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5044 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5045 Register NewExec = MRI.createVirtualRegister(BoolRC);
5046 Register CurrentIdxReg =
5047 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5048 Register CondReg = MRI.createVirtualRegister(BoolRC);
5049
5050 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5051 .addReg(InitReg)
5052 .addMBB(&OrigBB)
5053 .addReg(ResultReg)
5054 .addMBB(&LoopBB);
5055
5056 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5057 .addReg(InitSaveExecReg)
5058 .addMBB(&OrigBB)
5059 .addReg(NewExec)
5060 .addMBB(&LoopBB);
5061
5062 // Read the next variant <- also loop target.
5063 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5064 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5065
5066 // Compare the just read M0 value to all possible Idx values.
5067 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5068 .addReg(CurrentIdxReg)
5069 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5070
5071 // Update EXEC, save the original EXEC value to VCC.
5072 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5073 .addReg(CondReg, RegState::Kill);
5074
5075 MRI.setSimpleHint(NewExec, CondReg);
5076
5077 if (UseGPRIdxMode) {
5078 if (Offset == 0) {
5079 SGPRIdxReg = CurrentIdxReg;
5080 } else {
5081 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5082 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5083 .addReg(CurrentIdxReg, RegState::Kill)
5084 .addImm(Offset);
5085 }
5086 } else {
5087 // Move index from VCC into M0
5088 if (Offset == 0) {
5089 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5090 .addReg(CurrentIdxReg, RegState::Kill);
5091 } else {
5092 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5093 .addReg(CurrentIdxReg, RegState::Kill)
5094 .addImm(Offset);
5095 }
5096 }
5097
5098 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5099 MachineInstr *InsertPt =
5100 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5101 .addReg(LMC.ExecReg)
5102 .addReg(NewExec);
5103
5104 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5105 // s_cbranch_scc0?
5106
5107 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5108 // clang-format off
5109 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5110 .addMBB(&LoopBB);
5111 // clang-format on
5112
5113 return InsertPt->getIterator();
5114}
5115
5116// This has slightly sub-optimal regalloc when the source vector is killed by
5117// the read. The register allocator does not understand that the kill is
5118 // per-workitem, so the source is kept alive for the whole loop and we end up
5119 // not reusing a subregister from it, using 1 more VGPR than necessary. That
5120 // extra VGPR was saved back when this was expanded after register allocation.
5121 static MachineBasicBlock::iterator
5122 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5123 unsigned InitResultReg, unsigned PhiReg, int Offset,
5124 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5125 MachineFunction *MF = MBB.getParent();
5126 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5127 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5128 MachineRegisterInfo &MRI = MF->getRegInfo();
5129 const DebugLoc &DL = MI.getDebugLoc();
5130 MachineBasicBlock::iterator I(&MI);
5131
5132 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5133 Register DstReg = MI.getOperand(0).getReg();
5134 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5135 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5137
5138 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5139
5140 // Save the EXEC mask
5141 // clang-format off
5142 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5143 .addReg(LMC.ExecReg);
5144 // clang-format on
5145
5146 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5147
5148 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5149
5150 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5151 InitResultReg, DstReg, PhiReg, TmpExec,
5152 Offset, UseGPRIdxMode, SGPRIdxReg);
5153
5154 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5155 MachineFunction::iterator MBBI(LoopBB);
5156 ++MBBI;
5157 MF->insert(MBBI, LandingPad);
5158 LoopBB->removeSuccessor(RemainderBB);
5159 LandingPad->addSuccessor(RemainderBB);
5160 LoopBB->addSuccessor(LandingPad);
5161 MachineBasicBlock::iterator First = LandingPad->begin();
5162 // clang-format off
5163 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5164 .addReg(SaveExec);
5165 // clang-format on
5166
5167 return InsPt;
5168}
5169
5170// Returns subreg index, offset
5171static std::pair<unsigned, int>
5172 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5173 const TargetRegisterClass *SuperRC, unsigned VecReg,
5174 int Offset) {
5175 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5176
5177 // Skip out of bounds offsets, or else we would end up using an undefined
5178 // register.
5179 if (Offset >= NumElts || Offset < 0)
5180 return std::pair(AMDGPU::sub0, Offset);
5181
5182 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5183}
5184
5185 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
5186 MachineRegisterInfo &MRI, MachineInstr &MI,
5187 int Offset) {
5188 MachineBasicBlock *MBB = MI.getParent();
5189 const DebugLoc &DL = MI.getDebugLoc();
5190 MachineBasicBlock::iterator I(&MI);
5191
5192 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5193
5194 assert(Idx->getReg() != AMDGPU::NoRegister);
5195
5196 if (Offset == 0) {
5197 // clang-format off
5198 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5199 .add(*Idx);
5200 // clang-format on
5201 } else {
5202 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5203 .add(*Idx)
5204 .addImm(Offset);
5205 }
5206}
5207
5208 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
5209 MachineRegisterInfo &MRI, MachineInstr &MI,
5210 int Offset) {
5211 MachineBasicBlock *MBB = MI.getParent();
5212 const DebugLoc &DL = MI.getDebugLoc();
5213 MachineBasicBlock::iterator I(&MI);
5214
5215 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5216
5217 if (Offset == 0)
5218 return Idx->getReg();
5219
5220 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5221 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5222 .add(*Idx)
5223 .addImm(Offset);
5224 return Tmp;
5225}
5226
5227 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5228 MachineBasicBlock &MBB,
5229 const GCNSubtarget &ST) {
5230 const SIInstrInfo *TII = ST.getInstrInfo();
5231 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5232 MachineFunction *MF = MBB.getParent();
5233 MachineRegisterInfo &MRI = MF->getRegInfo();
5234
5235 Register Dst = MI.getOperand(0).getReg();
5236 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5237 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5238 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5239
5240 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5241 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5242
5243 unsigned SubReg;
5244 std::tie(SubReg, Offset) =
5245 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5246
5247 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5248
5249 // Check for a SGPR index.
5250 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5251 MachineBasicBlock::iterator I(&MI);
5252 const DebugLoc &DL = MI.getDebugLoc();
5253
5254 if (UseGPRIdxMode) {
5255 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5256 // to avoid interfering with other uses, so probably requires a new
5257 // optimization pass.
5258 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5259
5260 const MCInstrDesc &GPRIDXDesc =
5261 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5262 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5263 .addReg(SrcReg)
5264 .addReg(Idx)
5265 .addImm(SubReg);
5266 } else {
5267 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5268
5269 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5270 .addReg(SrcReg, 0, SubReg)
5271 .addReg(SrcReg, RegState::Implicit);
5272 }
5273
5274 MI.eraseFromParent();
5275
5276 return &MBB;
5277 }
5278
5279 // Control flow needs to be inserted if indexing with a VGPR.
5280 const DebugLoc &DL = MI.getDebugLoc();
5281 MachineBasicBlock::iterator I(&MI);
5282
5283 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5284 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5285
5286 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5287
5288 Register SGPRIdxReg;
5289 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5290 UseGPRIdxMode, SGPRIdxReg);
5291
5292 MachineBasicBlock *LoopBB = InsPt->getParent();
5293
5294 if (UseGPRIdxMode) {
5295 const MCInstrDesc &GPRIDXDesc =
5296 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5297
5298 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5299 .addReg(SrcReg)
5300 .addReg(SGPRIdxReg)
5301 .addImm(SubReg);
5302 } else {
5303 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5304 .addReg(SrcReg, 0, SubReg)
5305 .addReg(SrcReg, RegState::Implicit);
5306 }
5307
5308 MI.eraseFromParent();
5309
5310 return LoopBB;
5311}
5312
5313 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5314 MachineBasicBlock &MBB,
5315 const GCNSubtarget &ST) {
5316 const SIInstrInfo *TII = ST.getInstrInfo();
5317 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5318 MachineFunction *MF = MBB.getParent();
5319 MachineRegisterInfo &MRI = MF->getRegInfo();
5320
5321 Register Dst = MI.getOperand(0).getReg();
5322 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5323 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5324 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5325 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5326 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5327 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5328
5329 // This can be an immediate, but will be folded later.
5330 assert(Val->getReg());
5331
5332 unsigned SubReg;
5333 std::tie(SubReg, Offset) =
5334 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5335 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5336
5337 if (Idx->getReg() == AMDGPU::NoRegister) {
5338 MachineBasicBlock::iterator I(&MI);
5339 const DebugLoc &DL = MI.getDebugLoc();
5340
5341 assert(Offset == 0);
5342
5343 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5344 .add(*SrcVec)
5345 .add(*Val)
5346 .addImm(SubReg);
5347
5348 MI.eraseFromParent();
5349 return &MBB;
5350 }
5351
5352 // Check for a SGPR index.
5353 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5354 MachineBasicBlock::iterator I(&MI);
5355 const DebugLoc &DL = MI.getDebugLoc();
5356
5357 if (UseGPRIdxMode) {
5358 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5359
5360 const MCInstrDesc &GPRIDXDesc =
5361 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5362 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5363 .addReg(SrcVec->getReg())
5364 .add(*Val)
5365 .addReg(Idx)
5366 .addImm(SubReg);
5367 } else {
5368 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
5369
5370 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5371 TRI.getRegSizeInBits(*VecRC), 32, false);
5372 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5373 .addReg(SrcVec->getReg())
5374 .add(*Val)
5375 .addImm(SubReg);
5376 }
5377 MI.eraseFromParent();
5378 return &MBB;
5379 }
5380
5381 // Control flow needs to be inserted if indexing with a VGPR.
5382 if (Val->isReg())
5383 MRI.clearKillFlags(Val->getReg());
5384
5385 const DebugLoc &DL = MI.getDebugLoc();
5386
5387 Register PhiReg = MRI.createVirtualRegister(VecRC);
5388
5389 Register SGPRIdxReg;
5390 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5391 UseGPRIdxMode, SGPRIdxReg);
5392 MachineBasicBlock *LoopBB = InsPt->getParent();
5393
5394 if (UseGPRIdxMode) {
5395 const MCInstrDesc &GPRIDXDesc =
5396 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5397
5398 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5399 .addReg(PhiReg)
5400 .add(*Val)
5401 .addReg(SGPRIdxReg)
5402 .addImm(SubReg);
5403 } else {
5404 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5405 TRI.getRegSizeInBits(*VecRC), 32, false);
5406 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5407 .addReg(PhiReg)
5408 .add(*Val)
5409 .addImm(SubReg);
5410 }
5411
5412 MI.eraseFromParent();
5413 return LoopBB;
5414}
5415
5416 static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5417 MachineBasicBlock *BB) {
5418 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5419 // For GFX12, we emit s_add_u64 and s_sub_u64.
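// Illustratively, the pre-GFX12 expansion of a 64-bit scalar add is:
//
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0   ; SCC = carry out
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1   ; consumes SCC
//
// (s_sub_u32 / s_subb_u32 for the subtract pseudo), with a REG_SEQUENCE to
// rebuild the 64-bit result from the two halves.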
5420 MachineFunction *MF = BB->getParent();
5421 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5422 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5423 MachineRegisterInfo &MRI = MF->getRegInfo();
5424 const DebugLoc &DL = MI.getDebugLoc();
5425 MachineOperand &Dest = MI.getOperand(0);
5426 MachineOperand &Src0 = MI.getOperand(1);
5427 MachineOperand &Src1 = MI.getOperand(2);
5428 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5429 if (ST.hasScalarAddSub64()) {
5430 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5431 // clang-format off
5432 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5433 .add(Src0)
5434 .add(Src1);
5435 // clang-format on
5436 } else {
5437 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5438 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5439
5440 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5441 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5442
5443 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5444 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5445 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5446 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5447
5448 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5449 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5450 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5451 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5452
5453 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5454 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5455 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5456 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5457 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5458 .addReg(DestSub0)
5459 .addImm(AMDGPU::sub0)
5460 .addReg(DestSub1)
5461 .addImm(AMDGPU::sub1);
5462 }
5463 MI.eraseFromParent();
5464 return BB;
5465}
5466
5467 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5468 switch (Opc) {
5469 case AMDGPU::S_MIN_U32:
5470 return std::numeric_limits<uint32_t>::max();
5471 case AMDGPU::S_MIN_I32:
5472 return std::numeric_limits<int32_t>::max();
5473 case AMDGPU::S_MAX_U32:
5474 return std::numeric_limits<uint32_t>::min();
5475 case AMDGPU::S_MAX_I32:
5476 return std::numeric_limits<int32_t>::min();
5477 case AMDGPU::S_ADD_I32:
5478 case AMDGPU::S_SUB_I32:
5479 case AMDGPU::S_OR_B32:
5480 case AMDGPU::S_XOR_B32:
5481 return std::numeric_limits<uint32_t>::min();
5482 case AMDGPU::S_AND_B32:
5483 return std::numeric_limits<uint32_t>::max();
5484 default:
5486 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5487 }
5488}
5489
5490 static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5491 switch (Opc) {
5492 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5493 return std::numeric_limits<uint64_t>::max();
5494 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5495 return std::numeric_limits<int64_t>::max();
5496 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5497 return std::numeric_limits<uint64_t>::min();
5498 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5499 return std::numeric_limits<int64_t>::min();
5500 case AMDGPU::S_ADD_U64_PSEUDO:
5501 case AMDGPU::S_SUB_U64_PSEUDO:
5502 case AMDGPU::S_OR_B64:
5503 case AMDGPU::S_XOR_B64:
5504 return std::numeric_limits<uint64_t>::min();
5505 case AMDGPU::S_AND_B64:
5506 return std::numeric_limits<uint64_t>::max();
5507 default:
5509 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5510 }
5511}
5512
5513static bool is32bitWaveReduceOperation(unsigned Opc) {
5514 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5515 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5516 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5517 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5518 Opc == AMDGPU::S_XOR_B32;
5519}
5520
5521 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5522 MachineBasicBlock &BB,
5523 const GCNSubtarget &ST,
5524 unsigned Opc) {
5525 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
5526 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5527 const DebugLoc &DL = MI.getDebugLoc();
5528 const SIInstrInfo *TII = ST.getInstrInfo();
5529
5530 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5531 Register SrcReg = MI.getOperand(1).getReg();
5532 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5533 Register DstReg = MI.getOperand(0).getReg();
5534 MachineBasicBlock *RetBB = nullptr;
5535 if (isSGPR) {
5536 switch (Opc) {
5537 case AMDGPU::S_MIN_U32:
5538 case AMDGPU::S_MIN_I32:
5539 case AMDGPU::S_MAX_U32:
5540 case AMDGPU::S_MAX_I32:
5541 case AMDGPU::S_AND_B32:
5542 case AMDGPU::S_OR_B32: {
5543 // Idempotent operations.
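// min/max/and/or of a value with itself any number of times is just the
// value, so a uniform (SGPR) input reduces to a plain copy.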
5544 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5545 RetBB = &BB;
5546 break;
5547 }
5548 case AMDGPU::V_CMP_LT_U64_e64: // umin
5549 case AMDGPU::V_CMP_LT_I64_e64: // min
5550 case AMDGPU::V_CMP_GT_U64_e64: // umax
5551 case AMDGPU::V_CMP_GT_I64_e64: // max
5552 case AMDGPU::S_AND_B64:
5553 case AMDGPU::S_OR_B64: {
5554 // Idempotent operations.
5555 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5556 RetBB = &BB;
5557 break;
5558 }
5559 case AMDGPU::S_XOR_B32:
5560 case AMDGPU::S_XOR_B64:
5561 case AMDGPU::S_ADD_I32:
5562 case AMDGPU::S_ADD_U64_PSEUDO:
5563 case AMDGPU::S_SUB_I32:
5564 case AMDGPU::S_SUB_U64_PSEUDO: {
5565 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5566 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5567 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5568 Register NumActiveLanes =
5569 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5570
5571 bool IsWave32 = ST.isWave32();
5572 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5573 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5574 unsigned BitCountOpc =
5575 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5576
5577 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5578
5579 auto NewAccumulator =
5580 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5581 .addReg(ExecMask);
5582
5583 switch (Opc) {
5584 case AMDGPU::S_XOR_B32:
5585 case AMDGPU::S_XOR_B64: {
5586 // Performing an XOR operation on a uniform value
5587 // depends on the parity of the number of active lanes.
5588 // For even parity, the result will be 0, for odd
5589 // parity the result will be the same as the input value.
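// That is, the reduced value is Src * (PopCount(EXEC) & 1): e.g. with five
// active lanes Src^Src^Src^Src^Src == Src, with four it is 0.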
5590 Register ParityRegister =
5591 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5592
5593 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5594 .addReg(NewAccumulator->getOperand(0).getReg())
5595 .addImm(1)
5596 .setOperandDead(3); // Dead scc
5597 if (Opc == AMDGPU::S_XOR_B32) {
5598 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5599 .addReg(SrcReg)
5600 .addReg(ParityRegister);
5601 } else {
5602 Register DestSub0 =
5603 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5604 Register DestSub1 =
5605 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5606
5607 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5608 const TargetRegisterClass *SrcSubRC =
5609 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5610
5611 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5612 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5613 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5614 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5615
5616 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5617 .add(Op1L)
5618 .addReg(ParityRegister);
5619
5620 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5621 .add(Op1H)
5622 .addReg(ParityRegister);
5623
5624 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5625 .addReg(DestSub0)
5626 .addImm(AMDGPU::sub0)
5627 .addReg(DestSub1)
5628 .addImm(AMDGPU::sub1);
5629 }
5630 break;
5631 }
5632 case AMDGPU::S_SUB_I32: {
5633 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5634
5635 // Take the negation of the source operand.
5636 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5637 .addImm(0)
5638 .addReg(SrcReg);
5639 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5640 .addReg(NegatedVal)
5641 .addReg(NewAccumulator->getOperand(0).getReg());
5642 break;
5643 }
5644 case AMDGPU::S_ADD_I32: {
5645 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5646 .addReg(SrcReg)
5647 .addReg(NewAccumulator->getOperand(0).getReg());
5648 break;
5649 }
5650 case AMDGPU::S_ADD_U64_PSEUDO:
5651 case AMDGPU::S_SUB_U64_PSEUDO: {
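// For a uniform 64-bit input the reduction is Src * NumActiveLanes, done
// with 32-bit scalar ops, roughly:
//   dst.lo = Op1L * N
//   dst.hi = mulhi_u32(Op1L, N) + Op1H * N
// For the SUB pseudo, N is negated (and sign-extended) first so the same
// multiply sequence computes Src * (-N).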
5652 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5654 Register Op1H_Op0L_Reg =
5655 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5656 Register Op1L_Op0H_Reg =
5657 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5658 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5659 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5660 Register NegatedValLo =
5661 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5662 Register NegatedValHi =
5663 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5664
5665 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5666 const TargetRegisterClass *Src1SubRC =
5667 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5668
5669 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5670 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5671 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5672 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5673
5674 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5675 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5676 .addImm(0)
5677 .addReg(NewAccumulator->getOperand(0).getReg())
5678 .setOperandDead(3); // Dead scc
5679 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5680 .addReg(NegatedValLo)
5681 .addImm(31)
5682 .setOperandDead(3); // Dead scc
5683 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5684 .add(Op1L)
5685 .addReg(NegatedValHi);
5686 }
5687 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5688 ? NegatedValLo
5689 : NewAccumulator->getOperand(0).getReg();
5690 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5691 .add(Op1L)
5692 .addReg(LowOpcode);
5693 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5694 .add(Op1L)
5695 .addReg(LowOpcode);
5696 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5697 .add(Op1H)
5698 .addReg(LowOpcode);
5699
5700 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5701 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5702 .addReg(CarryReg)
5703 .addReg(Op1H_Op0L_Reg)
5704 .setOperandDead(3); // Dead scc
5705
5706 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5707 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5708 .addReg(HiVal)
5709 .addReg(Op1L_Op0H_Reg)
5710 .setOperandDead(3); // Dead scc
5711 }
5712 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5713 .addReg(DestSub0)
5714 .addImm(AMDGPU::sub0)
5715 .addReg(DestSub1)
5716 .addImm(AMDGPU::sub1);
5717 break;
5718 }
5719 }
5720 RetBB = &BB;
5721 }
5722 }
5723 } else {
5724 // TODO: Implement DPP Strategy and switch based on immediate strategy
5725 // operand. For now, for all the cases (default, Iterative and DPP) we use
5726 // the iterative approach by default.
5727
5728 // To reduce the VGPR with the iterative approach, we need to iterate
5729 // over all the active lanes. Lowering consists of a ComputeLoop,
5730 // which iterates over only the active lanes. We use a copy of the EXEC
5731 // register as the induction variable; every active lane clears its bit
5732 // with bitset0 so that the next iteration sees the next active lane.
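// Roughly (illustrative only):
//
//   ActiveBits  = EXEC;
//   Accumulator = IdentityValue;
//   do {
//     Lane        = s_ff1(ActiveBits);        // lowest remaining active lane
//     LaneValue   = v_readlane(Src, Lane);
//     Accumulator = <Opc>(Accumulator, LaneValue);
//     ActiveBits  = s_bitset0(ActiveBits, Lane);
//   } while (ActiveBits != 0);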
5733 MachineBasicBlock::iterator I = BB.end();
5734 Register SrcReg = MI.getOperand(1).getReg();
5735 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5736
5737 // Create Control flow for loop
5738 // Split MI's Machine Basic block into For loop
5739 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5740
5741 // Create virtual registers required for lowering.
5742 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5743 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5744 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5745 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5746 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5747 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5748 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5749 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5750 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5751
5752 bool IsWave32 = ST.isWave32();
5753 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5754 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5755
5756 // Create initial values of induction variable from Exec, Accumulator and
5757 // insert branch instr to newly created ComputeBlock
5758 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5759 if (is32BitOpc) {
5760 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5761 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5762 .addImm(IdentityValue);
5763 } else {
5764 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5765 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5766 .addImm(IdentityValue);
5767 }
5768 // clang-format off
5769 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5770 .addMBB(ComputeLoop);
5771 // clang-format on
5772
5773 // Start constructing ComputeLoop
5774 I = ComputeLoop->begin();
5775 auto Accumulator =
5776 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5777 .addReg(IdentityValReg)
5778 .addMBB(&BB);
5779 auto ActiveBits =
5780 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5781 .addReg(LoopIterator)
5782 .addMBB(&BB);
5783
5784 I = ComputeLoop->end();
5785 MachineInstr *NewAccumulator;
5786 // Perform the computations
5787 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5788 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5789 .addReg(ActiveBitsReg);
5790 if (is32BitOpc) {
5791 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5792 LaneValueReg)
5793 .addReg(SrcReg)
5794 .addReg(FF1Reg);
5795 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5796 .addReg(Accumulator->getOperand(0).getReg())
5797 .addReg(LaneValueReg);
5798 } else {
5799 Register LaneValueLoReg =
5800 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5801 Register LaneValueHiReg =
5802 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5803 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5804 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5805 const TargetRegisterClass *SrcSubRC =
5806 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5807 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5808 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5809 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5810 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5811 // lane value input should be in an sgpr
5812 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5813 LaneValueLoReg)
5814 .add(Op1L)
5815 .addReg(FF1Reg);
5816 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5817 LaneValueHiReg)
5818 .add(Op1H)
5819 .addReg(FF1Reg);
5820 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5821 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5822 .addReg(LaneValueLoReg)
5823 .addImm(AMDGPU::sub0)
5824 .addReg(LaneValueHiReg)
5825 .addImm(AMDGPU::sub1);
5826 switch (Opc) {
5827 case AMDGPU::S_OR_B64:
5828 case AMDGPU::S_AND_B64:
5829 case AMDGPU::S_XOR_B64: {
5830 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5831 .addReg(Accumulator->getOperand(0).getReg())
5832 .addReg(LaneValue->getOperand(0).getReg())
5833 .setOperandDead(3); // Dead scc
5834 break;
5835 }
5836 case AMDGPU::V_CMP_GT_I64_e64:
5837 case AMDGPU::V_CMP_GT_U64_e64:
5838 case AMDGPU::V_CMP_LT_I64_e64:
5839 case AMDGPU::V_CMP_LT_U64_e64: {
5840 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5841 Register ComparisonResultReg =
5842 MRI.createVirtualRegister(WaveMaskRegClass);
5843 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5844 const TargetRegisterClass *VSubRegClass =
5845 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5846 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5847 MachineOperand SrcReg0Sub0 =
5848 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5849 VregClass, AMDGPU::sub0, VSubRegClass);
5850 MachineOperand SrcReg0Sub1 =
5851 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5852 VregClass, AMDGPU::sub1, VSubRegClass);
5853 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5854 AccumulatorVReg)
5855 .add(SrcReg0Sub0)
5856 .addImm(AMDGPU::sub0)
5857 .add(SrcReg0Sub1)
5858 .addImm(AMDGPU::sub1);
5859 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5860 .addReg(LaneValue->getOperand(0).getReg())
5861 .addReg(AccumulatorVReg);
5862
5863 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5864 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5865 .addReg(LaneMaskReg)
5866 .addReg(ActiveBitsReg);
5867
5868 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5869 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5870 .addReg(LaneValue->getOperand(0).getReg())
5871 .addReg(Accumulator->getOperand(0).getReg());
5872 break;
5873 }
5874 case AMDGPU::S_ADD_U64_PSEUDO:
5875 case AMDGPU::S_SUB_U64_PSEUDO: {
5876 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5877 .addReg(Accumulator->getOperand(0).getReg())
5878 .addReg(LaneValue->getOperand(0).getReg());
5879 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5880 break;
5881 }
5882 }
5883 }
5884 // Manipulate the iterator to get the next active lane
5885 unsigned BITSETOpc =
5886 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5887 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5888 .addReg(FF1Reg)
5889 .addReg(ActiveBitsReg);
5890
5891 // Add phi nodes
5892 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5893 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5894
5895 // Creating branching
5896 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5897 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5898 .addReg(NewActiveBitsReg)
5899 .addImm(0);
5900 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5901 .addMBB(ComputeLoop);
5902
5903 RetBB = ComputeEnd;
5904 }
5905 MI.eraseFromParent();
5906 return RetBB;
5907}
5908
5909 MachineBasicBlock *
5910 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5911 MachineBasicBlock *BB) const {
5912 MachineFunction *MF = BB->getParent();
5913 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5914 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5915 const SIInstrInfo *TII = Subtarget->getInstrInfo();
5916 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5917 MachineRegisterInfo &MRI = MF->getRegInfo();
5918 const DebugLoc &DL = MI.getDebugLoc();
5919
5920 switch (MI.getOpcode()) {
5921 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5922 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5923 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5924 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5925 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5927 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5929 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5930 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5931 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5932 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5933 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5934 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5935 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5936 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5937 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5938 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5939 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5940 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5941 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5942 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5943 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5944 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5945 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5946 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5947 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5948 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5949 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5950 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5951 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5952 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5953 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5954 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5955 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5956 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5957 case AMDGPU::S_UADDO_PSEUDO:
5958 case AMDGPU::S_USUBO_PSEUDO: {
5959 MachineOperand &Dest0 = MI.getOperand(0);
5960 MachineOperand &Dest1 = MI.getOperand(1);
5961 MachineOperand &Src0 = MI.getOperand(2);
5962 MachineOperand &Src1 = MI.getOperand(3);
5963
5964 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5965 ? AMDGPU::S_ADD_U32
5966 : AMDGPU::S_SUB_U32;
5967 // clang-format off
5968 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5969 .add(Src0)
5970 .add(Src1);
5971 // clang-format on
5972
5973 unsigned SelOpc =
5974 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5975 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
5976
5977 MI.eraseFromParent();
5978 return BB;
5979 }
5980 case AMDGPU::S_ADD_U64_PSEUDO:
5981 case AMDGPU::S_SUB_U64_PSEUDO: {
5982 return Expand64BitScalarArithmetic(MI, BB);
5983 }
5984 case AMDGPU::V_ADD_U64_PSEUDO:
5985 case AMDGPU::V_SUB_U64_PSEUDO: {
5986 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5987
5988 MachineOperand &Dest = MI.getOperand(0);
5989 MachineOperand &Src0 = MI.getOperand(1);
5990 MachineOperand &Src1 = MI.getOperand(2);
5991
5992 if (ST.hasAddSubU64Insts()) {
5993 auto I = BuildMI(*BB, MI, DL,
5994 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5995 : AMDGPU::V_SUB_U64_e64),
5996 Dest.getReg())
5997 .add(Src0)
5998 .add(Src1)
5999 .addImm(0); // clamp
6000 TII->legalizeOperands(*I);
6001 MI.eraseFromParent();
6002 return BB;
6003 }
6004
6005 if (IsAdd && ST.hasLshlAddU64Inst()) {
6006 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6007 Dest.getReg())
6008 .add(Src0)
6009 .addImm(0)
6010 .add(Src1);
6011 TII->legalizeOperands(*Add);
6012 MI.eraseFromParent();
6013 return BB;
6014 }
6015
6016 const auto *CarryRC = TRI->getWaveMaskRegClass();
6017
6018 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6019 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6020
6021 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6022 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6023
6024 const TargetRegisterClass *Src0RC = Src0.isReg()
6025 ? MRI.getRegClass(Src0.getReg())
6026 : &AMDGPU::VReg_64RegClass;
6027 const TargetRegisterClass *Src1RC = Src1.isReg()
6028 ? MRI.getRegClass(Src1.getReg())
6029 : &AMDGPU::VReg_64RegClass;
6030
6031 const TargetRegisterClass *Src0SubRC =
6032 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6033 const TargetRegisterClass *Src1SubRC =
6034 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6035
6036 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6037 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6038 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6039 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6040
6041 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6042 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6043 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6044 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6045
6046 unsigned LoOpc =
6047 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6048 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6049 .addReg(CarryReg, RegState::Define)
6050 .add(SrcReg0Sub0)
6051 .add(SrcReg1Sub0)
6052 .addImm(0); // clamp bit
6053
6054 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6055 MachineInstr *HiHalf =
6056 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6057 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6058 .add(SrcReg0Sub1)
6059 .add(SrcReg1Sub1)
6060 .addReg(CarryReg, RegState::Kill)
6061 .addImm(0); // clamp bit
6062
6063 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6064 .addReg(DestSub0)
6065 .addImm(AMDGPU::sub0)
6066 .addReg(DestSub1)
6067 .addImm(AMDGPU::sub1);
6068 TII->legalizeOperands(*LoHalf);
6069 TII->legalizeOperands(*HiHalf);
6070 MI.eraseFromParent();
6071 return BB;
6072 }
6073 case AMDGPU::S_ADD_CO_PSEUDO:
6074 case AMDGPU::S_SUB_CO_PSEUDO: {
6075 // This pseudo has a chance to be selected
6076 // only from a uniform add/subcarry node. All the VGPR operands are
6077 // therefore assumed to be splat vectors.
6078 MachineBasicBlock::iterator MII = MI;
6079 MachineOperand &Dest = MI.getOperand(0);
6080 MachineOperand &CarryDest = MI.getOperand(1);
6081 MachineOperand &Src0 = MI.getOperand(2);
6082 MachineOperand &Src1 = MI.getOperand(3);
6083 MachineOperand &Src2 = MI.getOperand(4);
6084 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6085 ? AMDGPU::S_ADDC_U32
6086 : AMDGPU::S_SUBB_U32;
6087 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6088 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6089 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6090 .addReg(Src0.getReg());
6091 Src0.setReg(RegOp0);
6092 }
6093 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6094 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6095 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6096 .addReg(Src1.getReg());
6097 Src1.setReg(RegOp1);
6098 }
6099 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6100 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6101 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6102 .addReg(Src2.getReg());
6103 Src2.setReg(RegOp2);
6104 }
6105
6106 if (ST.isWave64()) {
6107 if (ST.hasScalarCompareEq64()) {
6108 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6109 .addReg(Src2.getReg())
6110 .addImm(0);
6111 } else {
6112 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6113 const TargetRegisterClass *SubRC =
6114 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6115 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6116 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6117 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6118 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6119 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6120
6121 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6122 .add(Src2Sub0)
6123 .add(Src2Sub1);
6124
6125 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6126 .addReg(Src2_32, RegState::Kill)
6127 .addImm(0);
6128 }
6129 } else {
6130 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6131 .addReg(Src2.getReg())
6132 .addImm(0);
6133 }
6134
6135 // clang-format off
6136 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6137 .add(Src0)
6138 .add(Src1);
6139 // clang-format on
6140
6141 unsigned SelOpc =
6142 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6143
6144 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6145 .addImm(-1)
6146 .addImm(0);
6147
6148 MI.eraseFromParent();
6149 return BB;
6150 }
6151 case AMDGPU::SI_INIT_M0: {
6152 MachineOperand &M0Init = MI.getOperand(0);
6153 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6154 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6155 AMDGPU::M0)
6156 .add(M0Init);
6157 MI.eraseFromParent();
6158 return BB;
6159 }
6160 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6161 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6162 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6163 TII->get(AMDGPU::S_CMP_EQ_U32))
6164 .addImm(0)
6165 .addImm(0);
6166 return BB;
6167 }
6168 case AMDGPU::GET_GROUPSTATICSIZE: {
6169 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6170 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6171 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6172 .add(MI.getOperand(0))
6173 .addImm(MFI->getLDSSize());
6174 MI.eraseFromParent();
6175 return BB;
6176 }
6177 case AMDGPU::GET_SHADERCYCLESHILO: {
6179 // The algorithm is:
6180 //
6181 // hi1 = getreg(SHADER_CYCLES_HI)
6182 // lo1 = getreg(SHADER_CYCLES_LO)
6183 // hi2 = getreg(SHADER_CYCLES_HI)
6184 //
6185 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6186 // Otherwise there was overflow and the result is hi2:0. In both cases the
6187 // result should represent the actual time at some point during the sequence
6188 // of three getregs.
6189 using namespace AMDGPU::Hwreg;
6190 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6191 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6192 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6193 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6194 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6195 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6196 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6197 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6198 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6200 .addReg(RegHi1)
6201 .addReg(RegHi2);
6202 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6203 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6204 .addReg(RegLo1)
6205 .addImm(0);
6206 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6207 .add(MI.getOperand(0))
6208 .addReg(RegLo)
6209 .addImm(AMDGPU::sub0)
6210 .addReg(RegHi2)
6211 .addImm(AMDGPU::sub1);
6212 MI.eraseFromParent();
6213 return BB;
6214 }
6215 case AMDGPU::SI_INDIRECT_SRC_V1:
6216 case AMDGPU::SI_INDIRECT_SRC_V2:
6217 case AMDGPU::SI_INDIRECT_SRC_V4:
6218 case AMDGPU::SI_INDIRECT_SRC_V8:
6219 case AMDGPU::SI_INDIRECT_SRC_V9:
6220 case AMDGPU::SI_INDIRECT_SRC_V10:
6221 case AMDGPU::SI_INDIRECT_SRC_V11:
6222 case AMDGPU::SI_INDIRECT_SRC_V12:
6223 case AMDGPU::SI_INDIRECT_SRC_V16:
6224 case AMDGPU::SI_INDIRECT_SRC_V32:
6225 return emitIndirectSrc(MI, *BB, *getSubtarget());
6226 case AMDGPU::SI_INDIRECT_DST_V1:
6227 case AMDGPU::SI_INDIRECT_DST_V2:
6228 case AMDGPU::SI_INDIRECT_DST_V4:
6229 case AMDGPU::SI_INDIRECT_DST_V8:
6230 case AMDGPU::SI_INDIRECT_DST_V9:
6231 case AMDGPU::SI_INDIRECT_DST_V10:
6232 case AMDGPU::SI_INDIRECT_DST_V11:
6233 case AMDGPU::SI_INDIRECT_DST_V12:
6234 case AMDGPU::SI_INDIRECT_DST_V16:
6235 case AMDGPU::SI_INDIRECT_DST_V32:
6236 return emitIndirectDst(MI, *BB, *getSubtarget());
6237 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6238 case AMDGPU::SI_KILL_I1_PSEUDO:
6239 return splitKillBlock(MI, BB);
6240 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6241 Register Dst = MI.getOperand(0).getReg();
6242 const MachineOperand &Src0 = MI.getOperand(1);
6243 const MachineOperand &Src1 = MI.getOperand(2);
6244 Register SrcCond = MI.getOperand(3).getReg();
6245
6246 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6247 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6248 const auto *CondRC = TRI->getWaveMaskRegClass();
6249 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6250
6251 const TargetRegisterClass *Src0RC = Src0.isReg()
6252 ? MRI.getRegClass(Src0.getReg())
6253 : &AMDGPU::VReg_64RegClass;
6254 const TargetRegisterClass *Src1RC = Src1.isReg()
6255 ? MRI.getRegClass(Src1.getReg())
6256 : &AMDGPU::VReg_64RegClass;
6257
6258 const TargetRegisterClass *Src0SubRC =
6259 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6260 const TargetRegisterClass *Src1SubRC =
6261 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6262
6263 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6264 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6265 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6266 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6267
6268 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6269 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6270 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6271 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6272
6273 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6274 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6275 .addImm(0)
6276 .add(Src0Sub0)
6277 .addImm(0)
6278 .add(Src1Sub0)
6279 .addReg(SrcCondCopy);
6280 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6281 .addImm(0)
6282 .add(Src0Sub1)
6283 .addImm(0)
6284 .add(Src1Sub1)
6285 .addReg(SrcCondCopy);
6286
6287 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6288 .addReg(DstLo)
6289 .addImm(AMDGPU::sub0)
6290 .addReg(DstHi)
6291 .addImm(AMDGPU::sub1);
6292 MI.eraseFromParent();
6293 return BB;
6294 }
6295 case AMDGPU::SI_BR_UNDEF: {
6296 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6297 .add(MI.getOperand(0));
6298 Br->getOperand(1).setIsUndef(); // read undef SCC
6299 MI.eraseFromParent();
6300 return BB;
6301 }
6302 case AMDGPU::ADJCALLSTACKUP:
6303 case AMDGPU::ADJCALLSTACKDOWN: {
6304 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6305 MachineInstrBuilder MIB(*MF, &MI);
6306 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6307 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6308 return BB;
6309 }
6310 case AMDGPU::SI_CALL_ISEL: {
6311 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6312
6313 MachineInstrBuilder MIB;
6314 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6315
6316 for (const MachineOperand &MO : MI.operands())
6317 MIB.add(MO);
6318
6319 MIB.cloneMemRefs(MI);
6320 MI.eraseFromParent();
6321 return BB;
6322 }
6323 case AMDGPU::V_ADD_CO_U32_e32:
6324 case AMDGPU::V_SUB_CO_U32_e32:
6325 case AMDGPU::V_SUBREV_CO_U32_e32: {
6326 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6327 unsigned Opc = MI.getOpcode();
6328
6329 bool NeedClampOperand = false;
6330 if (TII->pseudoToMCOpcode(Opc) == -1) {
6331 Opc = AMDGPU::getVOPe64(Opc);
6332 NeedClampOperand = true;
6333 }
6334
6335 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6336 if (TII->isVOP3(*I)) {
6337 I.addReg(TRI->getVCC(), RegState::Define);
6338 }
6339 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6340 if (NeedClampOperand)
6341 I.addImm(0); // clamp bit for e64 encoding
6342
6343 TII->legalizeOperands(*I);
6344
6345 MI.eraseFromParent();
6346 return BB;
6347 }
6348 case AMDGPU::V_ADDC_U32_e32:
6349 case AMDGPU::V_SUBB_U32_e32:
6350 case AMDGPU::V_SUBBREV_U32_e32:
6351 // These instructions have an implicit use of vcc which counts towards the
6352 // constant bus limit.
6353 TII->legalizeOperands(MI);
6354 return BB;
6355 case AMDGPU::DS_GWS_INIT:
6356 case AMDGPU::DS_GWS_SEMA_BR:
6357 case AMDGPU::DS_GWS_BARRIER:
6358 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6359 [[fallthrough]];
6360 case AMDGPU::DS_GWS_SEMA_V:
6361 case AMDGPU::DS_GWS_SEMA_P:
6362 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6363 // A s_waitcnt 0 is required to be the instruction immediately following.
6364 if (getSubtarget()->hasGWSAutoReplay()) {
6365 bundleInstWithWaitcnt(MI);
6366 return BB;
6367 }
6368
6369 return emitGWSMemViolTestLoop(MI, BB);
6370 case AMDGPU::S_SETREG_B32: {
6371 // Try to optimize cases that only set the denormal mode or rounding mode.
6372 //
6373 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6374 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6375 // instead.
6376 //
6377 // FIXME: This could be predicates on the immediate, but tablegen doesn't
6378 // allow you to have a no side effect instruction in the output of a
6379 // sideeffecting pattern.
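// For example, a constant s_setreg_b32 that covers exactly the four
// round-mode bits of the MODE register becomes a single s_round_mode, and
// one that covers exactly the four denorm-mode bits becomes s_denorm_mode.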
6380 auto [ID, Offset, Width] =
6381 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6382 if (ID != AMDGPU::Hwreg::ID_MODE)
6383 return BB;
6384
6385 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6386 const unsigned SetMask = WidthMask << Offset;
6387
6388 if (getSubtarget()->hasDenormModeInst()) {
6389 unsigned SetDenormOp = 0;
6390 unsigned SetRoundOp = 0;
6391
6392 // The dedicated instructions can only set the whole denorm or round mode
6393 // at once, not a subset of bits in either.
6394 if (SetMask ==
6395 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6396 // If this fully sets both the round and denorm mode, emit the two
6397 // dedicated instructions for these.
6398 SetRoundOp = AMDGPU::S_ROUND_MODE;
6399 SetDenormOp = AMDGPU::S_DENORM_MODE;
6400 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6401 SetRoundOp = AMDGPU::S_ROUND_MODE;
6402 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6403 SetDenormOp = AMDGPU::S_DENORM_MODE;
6404 }
6405
6406 if (SetRoundOp || SetDenormOp) {
6407 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6408 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6409 unsigned ImmVal = Def->getOperand(1).getImm();
6410 if (SetRoundOp) {
6411 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6412 .addImm(ImmVal & 0xf);
6413
6414 // If we also have the denorm mode, get just the denorm mode bits.
6415 ImmVal >>= 4;
6416 }
6417
6418 if (SetDenormOp) {
6419 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6420 .addImm(ImmVal & 0xf);
6421 }
6422
6423 MI.eraseFromParent();
6424 return BB;
6425 }
6426 }
6427 }
6428
6429 // If only FP bits are touched, use the no-side-effects pseudo.
6430 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6431 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6432 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6433
6434 return BB;
6435 }
6436 case AMDGPU::S_INVERSE_BALLOT_U32:
6437 case AMDGPU::S_INVERSE_BALLOT_U64:
6438 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6439 // necessary. After that they are equivalent to a COPY.
6440 MI.setDesc(TII->get(AMDGPU::COPY));
6441 return BB;
6442 case AMDGPU::ENDPGM_TRAP: {
6443 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6444 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6445 MI.addOperand(MachineOperand::CreateImm(0));
6446 return BB;
6447 }
6448
6449 // We need a block split to make the real endpgm a terminator. We also don't
6450 // want to break phis in successor blocks, so we can't just delete to the
6451 // end of the block.
6452
6453 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6454 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6455 MF->push_back(TrapBB);
6456 // clang-format off
6457 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6458 .addImm(0);
6459 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6460 .addMBB(TrapBB);
6461 // clang-format on
6462
6463 BB->addSuccessor(TrapBB);
6464 MI.eraseFromParent();
6465 return SplitBB;
6466 }
6467 case AMDGPU::SIMULATED_TRAP: {
6468 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6469 MachineBasicBlock *SplitBB =
6470 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6471 MI.eraseFromParent();
6472 return SplitBB;
6473 }
6474 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6475 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6477
6478 // During ISel, it's difficult to propagate the original EXEC mask to use as
6479 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6480 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6481 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6482 Register OriginalExec = Setup->getOperand(0).getReg();
6483 MF->getRegInfo().clearKillFlags(OriginalExec);
6484 MI.getOperand(0).setReg(OriginalExec);
6485 return BB;
6486 }
6487 default:
6488 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6489 if (!MI.mayStore())
6491 return BB;
6492 }
6493 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
6494 }
6495}
6496
6497 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6498 // This currently forces unfolding various combinations of fsub into fma with
6499 // free fneg'd operands. As long as we have fast FMA (controlled by
6500 // isFMAFasterThanFMulAndFAdd), we should perform these.
6501
6502 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6503 // most of these combines appear to be cycle neutral but save on instruction
6504 // count / code size.
6505 return true;
6506}
6507
6509
6510 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
6511 EVT VT) const {
6512 if (!VT.isVector()) {
6513 return MVT::i1;
6514 }
6515 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6516}
6517
6518 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
6519 // TODO: Should i16 be used always if legal? For now it would force VALU
6520 // shifts.
6521 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6522}
6523
6524 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
6525 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6526 ? Ty.changeElementSize(16)
6527 : Ty.changeElementSize(32);
6528}
6529
6530 // Answering this is somewhat tricky and depends on the specific device, since
6531 // different devices have different rates for fma or all f64 operations.
6532//
6533// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6534// regardless of which device (although the number of cycles differs between
6535// devices), so it is always profitable for f64.
6536//
6537// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6538// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6539// which we can always do even without fused FP ops since it returns the same
6540// result as the separate operations and since it is always full
6541// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6542// however does not support denormals, so we do report fma as faster if we have
6543// a fast fma device and require denormals.
6544//
6545 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6546 EVT VT) const {
6547 VT = VT.getScalarType();
6548
6549 switch (VT.getSimpleVT().SimpleTy) {
6550 case MVT::f32: {
6551 // If mad is not available this depends only on if f32 fma is full rate.
6552 if (!Subtarget->hasMadMacF32Insts())
6553 return Subtarget->hasFastFMAF32();
6554
6555 // Otherwise f32 mad is always full rate and returns the same result as
6556 // the separate operations so should be preferred over fma.
6557 // However does not support denormals.
6558 if (!denormalModeIsFlushAllF32(MF))
6559 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6560
6561 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6562 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6563 }
6564 case MVT::f64:
6565 return true;
6566 case MVT::f16:
6567 case MVT::bf16:
6568 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6569 default:
6570 break;
6571 }
6572
6573 return false;
6574}
6575
6576 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6577 LLT Ty) const {
6578 switch (Ty.getScalarSizeInBits()) {
6579 case 16:
6580 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6581 case 32:
6582 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6583 case 64:
6584 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6585 default:
6586 break;
6587 }
6588
6589 return false;
6590}
6591
6592 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
6593 if (!Ty.isScalar())
6594 return false;
6595
6596 if (Ty.getScalarSizeInBits() == 16)
6597 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6598 if (Ty.getScalarSizeInBits() == 32)
6599 return Subtarget->hasMadMacF32Insts() &&
6600 denormalModeIsFlushAllF32(*MI.getMF());
6601
6602 return false;
6603}
6604
6605 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6606 const SDNode *N) const {
6607 // TODO: Check future ftz flag
6608 // v_mad_f32/v_mac_f32 do not support denormals.
6609 EVT VT = N->getValueType(0);
6610 if (VT == MVT::f32)
6611 return Subtarget->hasMadMacF32Insts() &&
6612 denormalModeIsFlushAllF32(DAG.getMachineFunction());
6613 if (VT == MVT::f16) {
6614 return Subtarget->hasMadF16() &&
6615 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6616 }
6617
6618 return false;
6619}
6620
6621//===----------------------------------------------------------------------===//
6622// Custom DAG Lowering Operations
6623//===----------------------------------------------------------------------===//
6624
6625// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6626// wider vector type is legal.
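// For example, a v4f16 fabs is lowered as two v2f16 fabs nodes whose results
// are rejoined with CONCAT_VECTORS, rather than being scalarized into four
// separate f16 operations.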
6627SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6628 SelectionDAG &DAG) const {
6629 unsigned Opc = Op.getOpcode();
6630 EVT VT = Op.getValueType();
6631 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6632 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6633 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6634 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6635 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6636 VT == MVT::v32bf16);
6637
6638 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6639
6640 SDLoc SL(Op);
6641 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6642 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6643
6644 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6645}
6646
6647// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6648// regression whereby extra unnecessary instructions were added to codegen
6649// for rotr operations, caused by legalising v2i32 or. This resulted in extra
6650// instructions to extract the result from the vector.
6651SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6652 [[maybe_unused]] EVT VT = Op.getValueType();
6653
6654 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6655 VT == MVT::v16i32) &&
6656 "Unexpected ValueType.");
6657
6658 return DAG.UnrollVectorOp(Op.getNode());
6659}
6660
6661// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6662// wider vector type is legal.
6663SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6664 SelectionDAG &DAG) const {
6665 unsigned Opc = Op.getOpcode();
6666 EVT VT = Op.getValueType();
6667 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6668 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6669 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6670 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6671 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6672 VT == MVT::v32bf16);
6673
6674 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6675 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6676
6677 SDLoc SL(Op);
6678
6679 SDValue OpLo =
6680 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6681 SDValue OpHi =
6682 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6683
6684 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6685}
6686
6687SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6688 SelectionDAG &DAG) const {
6689 unsigned Opc = Op.getOpcode();
6690 EVT VT = Op.getValueType();
6691 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6692 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6693 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6694 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6695 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6696 VT == MVT::v32bf16);
6697
6698 SDValue Op0 = Op.getOperand(0);
6699 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6700 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6701 : std::pair(Op0, Op0);
6702
6703 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6704 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6705
6706 SDLoc SL(Op);
6707 auto ResVT = DAG.GetSplitDestVTs(VT);
6708
6709 SDValue OpLo =
6710 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6711 SDValue OpHi =
6712 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6713
6714 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6715}
6716
6717SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6718 switch (Op.getOpcode()) {
6719 default:
6720 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6721 case ISD::BRCOND:
6722 return LowerBRCOND(Op, DAG);
6723 case ISD::RETURNADDR:
6724 return LowerRETURNADDR(Op, DAG);
6725 case ISD::LOAD: {
6726 SDValue Result = LowerLOAD(Op, DAG);
6727 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6728 "Load should return a value and a chain");
6729 return Result;
6730 }
6731 case ISD::FSQRT: {
6732 EVT VT = Op.getValueType();
6733 if (VT == MVT::f32)
6734 return lowerFSQRTF32(Op, DAG);
6735 if (VT == MVT::f64)
6736 return lowerFSQRTF64(Op, DAG);
6737 return SDValue();
6738 }
6739 case ISD::FSIN:
6740 case ISD::FCOS:
6741 return LowerTrig(Op, DAG);
6742 case ISD::SELECT:
6743 return LowerSELECT(Op, DAG);
6744 case ISD::FDIV:
6745 return LowerFDIV(Op, DAG);
6746 case ISD::FFREXP:
6747 return LowerFFREXP(Op, DAG);
6748 case ISD::ATOMIC_CMP_SWAP:
6749 return LowerATOMIC_CMP_SWAP(Op, DAG);
6750 case ISD::STORE:
6751 return LowerSTORE(Op, DAG);
6752 case ISD::GlobalAddress: {
6753 MachineFunction &MF = DAG.getMachineFunction();
6754 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6755 return LowerGlobalAddress(MFI, Op, DAG);
6756 }
6757 case ISD::INTRINSIC_WO_CHAIN:
6758 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6759 case ISD::INTRINSIC_W_CHAIN:
6760 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6761 case ISD::INTRINSIC_VOID:
6762 return LowerINTRINSIC_VOID(Op, DAG);
6763 case ISD::ADDRSPACECAST:
6764 return lowerADDRSPACECAST(Op, DAG);
6765 case ISD::INSERT_SUBVECTOR:
6766 return lowerINSERT_SUBVECTOR(Op, DAG);
6767 case ISD::INSERT_VECTOR_ELT:
6768 return lowerINSERT_VECTOR_ELT(Op, DAG);
6769 case ISD::EXTRACT_VECTOR_ELT:
6770 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6771 case ISD::VECTOR_SHUFFLE:
6772 return lowerVECTOR_SHUFFLE(Op, DAG);
6773 case ISD::SCALAR_TO_VECTOR:
6774 return lowerSCALAR_TO_VECTOR(Op, DAG);
6775 case ISD::BUILD_VECTOR:
6776 return lowerBUILD_VECTOR(Op, DAG);
6777 case ISD::FP_ROUND:
6778 case ISD::STRICT_FP_ROUND:
6779 return lowerFP_ROUND(Op, DAG);
6780 case ISD::TRAP:
6781 return lowerTRAP(Op, DAG);
6782 case ISD::DEBUGTRAP:
6783 return lowerDEBUGTRAP(Op, DAG);
6784 case ISD::ABS:
6785 case ISD::FABS:
6786 case ISD::FNEG:
6787 case ISD::FCANONICALIZE:
6788 case ISD::BSWAP:
6789 return splitUnaryVectorOp(Op, DAG);
6790 case ISD::FMINNUM:
6791 case ISD::FMAXNUM:
6792 return lowerFMINNUM_FMAXNUM(Op, DAG);
6793 case ISD::FMINIMUMNUM:
6794 case ISD::FMAXIMUMNUM:
6795 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6796 case ISD::FMINIMUM:
6797 case ISD::FMAXIMUM:
6798 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6799 case ISD::FLDEXP:
6800 case ISD::STRICT_FLDEXP:
6801 return lowerFLDEXP(Op, DAG);
6802 case ISD::FMA:
6803 return splitTernaryVectorOp(Op, DAG);
6804 case ISD::FP_TO_SINT:
6805 case ISD::FP_TO_UINT:
6806 return LowerFP_TO_INT(Op, DAG);
6807 case ISD::SHL:
6808 case ISD::SRA:
6809 case ISD::SRL:
6810 case ISD::ADD:
6811 case ISD::SUB:
6812 case ISD::SMIN:
6813 case ISD::SMAX:
6814 case ISD::UMIN:
6815 case ISD::UMAX:
6816 case ISD::FADD:
6817 case ISD::FMUL:
6818 case ISD::FMINNUM_IEEE:
6819 case ISD::FMAXNUM_IEEE:
6820 case ISD::UADDSAT:
6821 case ISD::USUBSAT:
6822 case ISD::SADDSAT:
6823 case ISD::SSUBSAT:
6824 return splitBinaryVectorOp(Op, DAG);
6825 case ISD::FCOPYSIGN:
6826 return lowerFCOPYSIGN(Op, DAG);
6827 case ISD::MUL:
6828 return lowerMUL(Op, DAG);
6829 case ISD::SMULO:
6830 case ISD::UMULO:
6831 return lowerXMULO(Op, DAG);
6832 case ISD::SMUL_LOHI:
6833 case ISD::UMUL_LOHI:
6834 return lowerXMUL_LOHI(Op, DAG);
6835 case ISD::DYNAMIC_STACKALLOC:
6836 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6837 case ISD::STACKSAVE:
6838 return LowerSTACKSAVE(Op, DAG);
6839 case ISD::GET_ROUNDING:
6840 return lowerGET_ROUNDING(Op, DAG);
6841 case ISD::SET_ROUNDING:
6842 return lowerSET_ROUNDING(Op, DAG);
6843 case ISD::PREFETCH:
6844 return lowerPREFETCH(Op, DAG);
6845 case ISD::FP_EXTEND:
6846 case ISD::STRICT_FP_EXTEND:
6847 return lowerFP_EXTEND(Op, DAG);
6848 case ISD::GET_FPENV:
6849 return lowerGET_FPENV(Op, DAG);
6850 case ISD::SET_FPENV:
6851 return lowerSET_FPENV(Op, DAG);
6852 case ISD::ROTR:
6853 return lowerROTR(Op, DAG);
6854 }
6855 return SDValue();
6856}
6857
6858// Used for D16: Casts the result of an instruction into the right vector,
6859// packs values if loads return unpacked values.
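// For example, with unpacked D16 memory instructions a v4f16 load is returned
// as v4i32; the elements are truncated to i16, rebuilt as a v4i16 vector, and
// bitcast back to v4f16.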
6860static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
6861 const SDLoc &DL, SelectionDAG &DAG,
6862 bool Unpacked) {
6863 if (!LoadVT.isVector())
6864 return Result;
6865
6866 // Cast back to the original packed type or to a larger type that is a
6867 // multiple of 32 bit for D16. Widening the return type is required for
6868 // legalization.
6869 EVT FittingLoadVT = LoadVT;
6870 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6871 FittingLoadVT =
6872 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6873 LoadVT.getVectorNumElements() + 1);
6874 }
6875
6876 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6877 // Truncate to v2i16/v4i16.
6878 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6879
6880 // Workaround legalizer not scalarizing truncate after vector op
6881 // legalization but not creating intermediate vector trunc.
6883 DAG.ExtractVectorElements(Result, Elts);
6884 for (SDValue &Elt : Elts)
6885 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6886
6887 // Pad illegal v1i16/v3f16 to v4i16
6888 if ((LoadVT.getVectorNumElements() % 2) == 1)
6889 Elts.push_back(DAG.getPOISON(MVT::i16));
6890
6891 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6892
6893 // Bitcast to original type (v2f16/v4f16).
6894 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6895 }
6896
6897 // Cast back to the original packed type.
6898 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6899}
6900
6901SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6902 SelectionDAG &DAG,
6903 ArrayRef<SDValue> Ops,
6904 bool IsIntrinsic) const {
6905 SDLoc DL(M);
6906
6907 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6908 EVT LoadVT = M->getValueType(0);
6909
6910 EVT EquivLoadVT = LoadVT;
6911 if (LoadVT.isVector()) {
6912 if (Unpacked) {
6913 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6914 LoadVT.getVectorNumElements());
6915 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6916 // Widen v3f16 to legal type
6917 EquivLoadVT =
6918 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6919 LoadVT.getVectorNumElements() + 1);
6920 }
6921 }
6922
6923 // Change from v4f16/v2f16 to EquivLoadVT.
6924 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6925
6926 SDValue Load = DAG.getMemIntrinsicNode(
6927 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6928 M->getMemoryVT(), M->getMemOperand());
6929
6930 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6931
6932 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6933}
6934
6935SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6936 SelectionDAG &DAG,
6937 ArrayRef<SDValue> Ops) const {
6938 SDLoc DL(M);
6939 EVT LoadVT = M->getValueType(0);
6940 EVT EltType = LoadVT.getScalarType();
6941 EVT IntVT = LoadVT.changeTypeToInteger();
6942
6943 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6944
6945 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6946 bool IsTFE = M->getNumValues() == 3;
6947
6948 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6949 : AMDGPUISD::BUFFER_LOAD_FORMAT)
6950 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6951 : AMDGPUISD::BUFFER_LOAD;
6952
6953 if (IsD16) {
6954 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6955 }
6956
6957 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6958 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6959 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6960 IsTFE);
6961
6962 if (isTypeLegal(LoadVT)) {
6963 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6964 M->getMemOperand(), DAG);
6965 }
6966
6967 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6968 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6969 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6970 M->getMemOperand(), DAG);
6971 return DAG.getMergeValues(
6972 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6973 DL);
6974}
6975
6977 SelectionDAG &DAG) {
6978 EVT VT = N->getValueType(0);
6979 unsigned CondCode = N->getConstantOperandVal(3);
6980 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6981 return DAG.getPOISON(VT);
6982
6983 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6984
6985 SDValue LHS = N->getOperand(1);
6986 SDValue RHS = N->getOperand(2);
6987
6988 SDLoc DL(N);
6989
6990 EVT CmpVT = LHS.getValueType();
6991 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6992 unsigned PromoteOp =
6993 ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6994 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6995 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6996 }
6997
6998 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6999
7000 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7001 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7002
7003 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7004 DAG.getCondCode(CCOpcode));
7005 if (VT.bitsEq(CCVT))
7006 return SetCC;
7007 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7008}
7009
7011 SelectionDAG &DAG) {
7012 EVT VT = N->getValueType(0);
7013
7014 unsigned CondCode = N->getConstantOperandVal(3);
7015 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7016 return DAG.getPOISON(VT);
7017
7018 SDValue Src0 = N->getOperand(1);
7019 SDValue Src1 = N->getOperand(2);
7020 EVT CmpVT = Src0.getValueType();
7021 SDLoc SL(N);
7022
7023 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7024 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7025 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7026 }
7027
7028 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7029 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7030 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7031 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7032 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7033 DAG.getCondCode(CCOpcode));
7034 if (VT.bitsEq(CCVT))
7035 return SetCC;
7036 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7037}
7038
7040 SelectionDAG &DAG) {
7041 EVT VT = N->getValueType(0);
7042 SDValue Src = N->getOperand(1);
7043 SDLoc SL(N);
7044
7045 if (Src.getOpcode() == ISD::SETCC) {
7046 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7047 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7048 Src.getOperand(1), Src.getOperand(2));
7049 }
7050 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7051 // (ballot 0) -> 0
7052 if (Arg->isZero())
7053 return DAG.getConstant(0, SL, VT);
7054
7055 // (ballot 1) -> EXEC/EXEC_LO
7056 if (Arg->isOne()) {
7057 Register Exec;
7058 if (VT.getScalarSizeInBits() == 32)
7059 Exec = AMDGPU::EXEC_LO;
7060 else if (VT.getScalarSizeInBits() == 64)
7061 Exec = AMDGPU::EXEC;
7062 else
7063 return SDValue();
7064
7065 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7066 }
7067 }
7068
7069 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7070 // ISD::SETNE)
7071 return DAG.getNode(
7072 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7073 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7074}
7075
7077 SelectionDAG &DAG) {
7078 EVT VT = N->getValueType(0);
7079 unsigned ValSize = VT.getSizeInBits();
7080 unsigned IID = N->getConstantOperandVal(0);
7081 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7082 IID == Intrinsic::amdgcn_permlanex16;
7083 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7084 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7085 SDLoc SL(N);
7086 MVT IntVT = MVT::getIntegerVT(ValSize);
7087 const GCNSubtarget *ST = TLI.getSubtarget();
7088 unsigned SplitSize = 32;
7089 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7090 ST->hasDPALU_DPP() &&
7091 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7092 SplitSize = 64;
7093
7094 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7095 SDValue Src2, MVT ValT) -> SDValue {
7097 switch (IID) {
7098 case Intrinsic::amdgcn_permlane16:
7099 case Intrinsic::amdgcn_permlanex16:
7100 case Intrinsic::amdgcn_update_dpp:
7101 Operands.push_back(N->getOperand(6));
7102 Operands.push_back(N->getOperand(5));
7103 Operands.push_back(N->getOperand(4));
7104 [[fallthrough]];
7105 case Intrinsic::amdgcn_writelane:
7106 Operands.push_back(Src2);
7107 [[fallthrough]];
7108 case Intrinsic::amdgcn_readlane:
7109 case Intrinsic::amdgcn_set_inactive:
7110 case Intrinsic::amdgcn_set_inactive_chain_arg:
7111 case Intrinsic::amdgcn_mov_dpp8:
7112 Operands.push_back(Src1);
7113 [[fallthrough]];
7114 case Intrinsic::amdgcn_readfirstlane:
7115 case Intrinsic::amdgcn_permlane64:
7116 Operands.push_back(Src0);
7117 break;
7118 default:
7119 llvm_unreachable("unhandled lane op");
7120 }
7121
7122 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7123 std::reverse(Operands.begin(), Operands.end());
7124
7125 if (SDNode *GL = N->getGluedNode()) {
7126 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7127 GL = GL->getOperand(0).getNode();
7128 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7129 SDValue(GL, 0)));
7130 }
7131
7132 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7133 };
7134
7135 SDValue Src0 = N->getOperand(1);
7136 SDValue Src1, Src2;
7137 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7138 IID == Intrinsic::amdgcn_mov_dpp8 ||
7139 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7140 Src1 = N->getOperand(2);
7141 if (IID == Intrinsic::amdgcn_writelane ||
7142 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7143 Src2 = N->getOperand(3);
7144 }
7145
7146 if (ValSize == SplitSize) {
7147 // Already legal
7148 return SDValue();
7149 }
7150
7151 if (ValSize < 32) {
7152 bool IsFloat = VT.isFloatingPoint();
7153 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7154 SL, MVT::i32);
7155
7156 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7157 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7158 SL, MVT::i32);
7159 }
7160
7161 if (IID == Intrinsic::amdgcn_writelane) {
7162 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7163 SL, MVT::i32);
7164 }
7165
7166 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7167 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7168 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7169 }
7170
7171 if (ValSize % SplitSize != 0)
7172 return SDValue();
7173
7174 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7175 EVT VT = N->getValueType(0);
7176 unsigned NE = VT.getVectorNumElements();
7177 EVT EltVT = VT.getVectorElementType();
7179 unsigned NumOperands = N->getNumOperands();
7180 SmallVector<SDValue, 4> Operands(NumOperands);
7181 SDNode *GL = N->getGluedNode();
7182
7183 // only handle convergencectrl_glue
7184 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7185
7186 for (unsigned i = 0; i != NE; ++i) {
7187 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7188 ++j) {
7189 SDValue Operand = N->getOperand(j);
7190 EVT OperandVT = Operand.getValueType();
7191 if (OperandVT.isVector()) {
7192 // A vector operand; extract a single element.
7193 EVT OperandEltVT = OperandVT.getVectorElementType();
7194 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7195 Operand, DAG.getVectorIdxConstant(i, SL));
7196 } else {
7197 // A scalar operand; just use it as is.
7198 Operands[j] = Operand;
7199 }
7200 }
7201
7202 if (GL)
7203 Operands[NumOperands - 1] =
7204 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7205 SDValue(GL->getOperand(0).getNode(), 0));
7206
7207 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7208 }
7209
7210 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7211 return DAG.getBuildVector(VecVT, SL, Scalars);
7212 };
7213
7214 if (VT.isVector()) {
7215 switch (MVT::SimpleValueType EltTy =
7216 VT.getVectorElementType().getSimpleVT().SimpleTy) {
7217 case MVT::i32:
7218 case MVT::f32:
7219 if (SplitSize == 32) {
7220 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7221 return unrollLaneOp(LaneOp.getNode());
7222 }
7223 [[fallthrough]];
7224 case MVT::i16:
7225 case MVT::f16:
7226 case MVT::bf16: {
7227 unsigned SubVecNumElt =
7228 SplitSize / VT.getVectorElementType().getSizeInBits();
7229 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7231 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7232 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7233 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7234 DAG.getConstant(EltIdx, SL, MVT::i32));
7235
7236 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7237 IsPermLane16)
7238 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7239 DAG.getConstant(EltIdx, SL, MVT::i32));
7240
7241 if (IID == Intrinsic::amdgcn_writelane)
7242 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7243 DAG.getConstant(EltIdx, SL, MVT::i32));
7244
7245 Pieces.push_back(
7246 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7247 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7248 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7249 EltIdx += SubVecNumElt;
7250 }
7251 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7252 }
7253 default:
7254 // Handle all other cases by bitcasting to i32 vectors
7255 break;
7256 }
7257 }
7258
7259 MVT VecVT =
7260 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7261 Src0 = DAG.getBitcast(VecVT, Src0);
7262
7263 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7264 Src1 = DAG.getBitcast(VecVT, Src1);
7265
7266 if (IID == Intrinsic::amdgcn_writelane)
7267 Src2 = DAG.getBitcast(VecVT, Src2);
7268
7269 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7270 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7271 return DAG.getBitcast(VT, UnrolledLaneOp);
7272}
7273
7274void SITargetLowering::ReplaceNodeResults(SDNode *N,
7275 SmallVectorImpl<SDValue> &Results,
7276 SelectionDAG &DAG) const {
7277 switch (N->getOpcode()) {
7278 case ISD::INSERT_VECTOR_ELT: {
7279 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7280 Results.push_back(Res);
7281 return;
7282 }
7283 case ISD::EXTRACT_VECTOR_ELT: {
7284 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7285 Results.push_back(Res);
7286 return;
7287 }
7288 case ISD::INTRINSIC_WO_CHAIN: {
7289 unsigned IID = N->getConstantOperandVal(0);
7290 switch (IID) {
7291 case Intrinsic::amdgcn_make_buffer_rsrc:
7292 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7293 return;
7294 case Intrinsic::amdgcn_cvt_pkrtz: {
7295 SDValue Src0 = N->getOperand(1);
7296 SDValue Src1 = N->getOperand(2);
7297 SDLoc SL(N);
7298 SDValue Cvt =
7299 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7300 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7301 return;
7302 }
7303 case Intrinsic::amdgcn_cvt_pknorm_i16:
7304 case Intrinsic::amdgcn_cvt_pknorm_u16:
7305 case Intrinsic::amdgcn_cvt_pk_i16:
7306 case Intrinsic::amdgcn_cvt_pk_u16: {
7307 SDValue Src0 = N->getOperand(1);
7308 SDValue Src1 = N->getOperand(2);
7309 SDLoc SL(N);
7310 unsigned Opcode;
7311
7312 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7313 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7314 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7315 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7316 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7317 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7318 else
7319 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7320
7321 EVT VT = N->getValueType(0);
7322 if (isTypeLegal(VT))
7323 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7324 else {
7325 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7326 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7327 }
7328 return;
7329 }
7330 case Intrinsic::amdgcn_s_buffer_load: {
7331 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7332 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7333 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7334 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7335 // s_buffer_load_i8.
7336 if (!Subtarget->hasScalarSubwordLoads())
7337 return;
7338 SDValue Op = SDValue(N, 0);
7339 SDValue Rsrc = Op.getOperand(1);
7340 SDValue Offset = Op.getOperand(2);
7341 SDValue CachePolicy = Op.getOperand(3);
7342 EVT VT = Op.getValueType();
7343 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7344 SDLoc DL(Op);
7346 const DataLayout &DataLayout = DAG.getDataLayout();
7347 Align Alignment =
7353 VT.getStoreSize(), Alignment);
7354 SDValue LoadVal;
7355 if (!Offset->isDivergent()) {
7356 SDValue Ops[] = {Rsrc, // source register
7357 Offset, CachePolicy};
7358 SDValue BufferLoad =
7360 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7361 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7362 } else {
7363 SDValue Ops[] = {
7364 DAG.getEntryNode(), // Chain
7365 Rsrc, // rsrc
7366 DAG.getConstant(0, DL, MVT::i32), // vindex
7367 {}, // voffset
7368 {}, // soffset
7369 {}, // offset
7370 CachePolicy, // cachepolicy
7371 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7372 };
7373 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7374 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7375 }
7376 Results.push_back(LoadVal);
7377 return;
7378 }
7379 case Intrinsic::amdgcn_dead: {
7380 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7381 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7382 return;
7383 }
7384 }
7385 break;
7386 }
7387 case ISD::INTRINSIC_W_CHAIN: {
7388 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7389 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7390 // FIXME: Hacky
7391 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7392 Results.push_back(Res.getOperand(I));
7393 }
7394 } else {
7395 Results.push_back(Res);
7396 Results.push_back(Res.getValue(1));
7397 }
7398 return;
7399 }
7400
7401 break;
7402 }
7403 case ISD::SELECT: {
7404 SDLoc SL(N);
7405 EVT VT = N->getValueType(0);
7406 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7407 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7408 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7409
7410 EVT SelectVT = NewVT;
7411 if (NewVT.bitsLT(MVT::i32)) {
7412 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7413 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7414 SelectVT = MVT::i32;
7415 }
7416
7417 SDValue NewSelect =
7418 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7419
7420 if (NewVT != SelectVT)
7421 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7422 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7423 return;
7424 }
7425 case ISD::FNEG: {
7426 if (N->getValueType(0) != MVT::v2f16)
7427 break;
7428
7429 SDLoc SL(N);
7430 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7431
7432 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7433 DAG.getConstant(0x80008000, SL, MVT::i32));
7434 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7435 return;
7436 }
7437 case ISD::FABS: {
7438 if (N->getValueType(0) != MVT::v2f16)
7439 break;
7440
7441 SDLoc SL(N);
7442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7443
7444 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7445 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7446 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7447 return;
7448 }
7449 case ISD::FSQRT: {
7450 if (N->getValueType(0) != MVT::f16)
7451 break;
7452 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7453 break;
7454 }
7455 default:
7456 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7457 break;
7458 }
7459}
7460
7461/// Helper function for LowerBRCOND
7462static SDNode *findUser(SDValue Value, unsigned Opcode) {
7463
7464 for (SDUse &U : Value->uses()) {
7465 if (U.get() != Value)
7466 continue;
7467
7468 if (U.getUser()->getOpcode() == Opcode)
7469 return U.getUser();
7470 }
7471 return nullptr;
7472}
7473
7474unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7475 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7476 switch (Intr->getConstantOperandVal(1)) {
7477 case Intrinsic::amdgcn_if:
7478 return AMDGPUISD::IF;
7479 case Intrinsic::amdgcn_else:
7480 return AMDGPUISD::ELSE;
7481 case Intrinsic::amdgcn_loop:
7482 return AMDGPUISD::LOOP;
7483 case Intrinsic::amdgcn_end_cf:
7484 llvm_unreachable("should not occur");
7485 default:
7486 return 0;
7487 }
7488 }
7489
7490 // break, if_break, else_break are all only used as inputs to loop, not
7491 // directly as branch conditions.
7492 return 0;
7493}
7494
7501
7502bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
7503 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7504 return false;
7505
7506 // FIXME: Either avoid relying on address space here or change the default
7507 // address space for functions to avoid the explicit check.
7508 return (GV->getValueType()->isFunctionTy() ||
7511}
7512
7513bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
7514 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7515}
7516
7517bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
7518 if (!GV->hasExternalLinkage())
7519 return true;
7520
7521 const auto OS = getTargetMachine().getTargetTriple().getOS();
7522 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7523}
7524
7525/// This transforms the control flow intrinsics to get the branch destination as
7526/// the last parameter; it also switches the branch target with BR if the need arises.
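/// For example, a BRCOND whose condition comes from llvm.amdgcn.if is
/// rewritten into an AMDGPUISD::IF node that carries the branch destination as
/// its final operand, and any following unconditional BR is updated to branch
/// to the BRCOND's original destination.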
7527SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7528 SDLoc DL(BRCOND);
7529
7530 SDNode *Intr = BRCOND.getOperand(1).getNode();
7531 SDValue Target = BRCOND.getOperand(2);
7532 SDNode *BR = nullptr;
7533 SDNode *SetCC = nullptr;
7534
7535 switch (Intr->getOpcode()) {
7536 case ISD::SETCC: {
7537 // As long as we negate the condition everything is fine
7538 SetCC = Intr;
7539 Intr = SetCC->getOperand(0).getNode();
7540 break;
7541 }
7542 case ISD::XOR: {
7543 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7544 SDValue LHS = Intr->getOperand(0);
7545 SDValue RHS = Intr->getOperand(1);
7546 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7547 Intr = LHS.getNode();
7548 break;
7549 }
7550 [[fallthrough]];
7551 }
7552 default: {
7553 // Get the target from BR if we don't negate the condition
7554 BR = findUser(BRCOND, ISD::BR);
7555 assert(BR && "brcond missing unconditional branch user");
7556 Target = BR->getOperand(1);
7557 }
7558 }
7559
7560 unsigned CFNode = isCFIntrinsic(Intr);
7561 if (CFNode == 0) {
7562 // This is a uniform branch so we don't need to legalize.
7563 return BRCOND;
7564 }
7565
7566 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7567 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
7568
7569 assert(!SetCC ||
7570 (SetCC->getConstantOperandVal(1) == 1 &&
7571 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7572 ISD::SETNE));
7573
7574 // operands of the new intrinsic call
7576 if (HaveChain)
7577 Ops.push_back(BRCOND.getOperand(0));
7578
7579 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7580 Ops.push_back(Target);
7581
7582 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7583
7584 // build the new intrinsic call
7585 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7586
7587 if (!HaveChain) {
7588 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7589
7590 Result = DAG.getMergeValues(Ops, DL).getNode();
7591 }
7592
7593 if (BR) {
7594 // Give the branch instruction our target
7595 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7596 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7597 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7598 }
7599
7600 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7601
7602 // Copy the intrinsic results to registers
7603 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7604 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7605 if (!CopyToReg)
7606 continue;
7607
7608 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7609 SDValue(Result, i - 1), SDValue());
7610
7611 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7612 }
7613
7614 // Remove the old intrinsic from the chain
7615 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7616 Intr->getOperand(0));
7617
7618 return Chain;
7619}
7620
7621SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7622 MVT VT = Op.getSimpleValueType();
7623 SDLoc DL(Op);
7624 // Checking the depth
7625 if (Op.getConstantOperandVal(0) != 0)
7626 return DAG.getConstant(0, DL, VT);
7627
7628 MachineFunction &MF = DAG.getMachineFunction();
7629 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7630 // Check for kernel and shader functions
7631 if (Info->isEntryFunction())
7632 return DAG.getConstant(0, DL, VT);
7633
7634 MachineFrameInfo &MFI = MF.getFrameInfo();
7635 // There is a call to @llvm.returnaddress in this function
7636 MFI.setReturnAddressIsTaken(true);
7637
7638 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7639 // Get the return address reg and mark it as an implicit live-in
7640 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7641 getRegClassFor(VT, Op.getNode()->isDivergent()));
7642
7643 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7644}
7645
7646SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7647 const SDLoc &DL, EVT VT) const {
7648 return Op.getValueType().bitsLE(VT)
7649 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7650 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7651 DAG.getTargetConstant(0, DL, MVT::i32));
7652}
7653
7654SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7655 SelectionDAG &DAG) const {
7656 EVT DstVT = Op.getValueType();
7657 unsigned NumElts = DstVT.getVectorNumElements();
7658 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7659
7660 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7661
7662 SDLoc DL(Op);
7663 unsigned Opc = Op.getOpcode();
7664 SDValue Flags = Op.getOperand(1);
7665 EVT HalfDstVT =
7666 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7667 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7668 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7669
7670 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7671}
7672
7673SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7674 SDValue Src = Op.getOperand(0);
7675 EVT SrcVT = Src.getValueType();
7676 EVT DstVT = Op.getValueType();
7677
7678 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7679 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7680 if (SrcVT.getScalarType() != MVT::f32)
7681 return SDValue();
7682 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7683 }
7684
7685 if (SrcVT.getScalarType() != MVT::f64)
7686 return Op;
7687
7688 SDLoc DL(Op);
7689 if (DstVT == MVT::f16) {
7690 // TODO: Handle strictfp
7691 if (Op.getOpcode() != ISD::FP_ROUND)
7692 return Op;
7693
7694 if (!Subtarget->has16BitInsts()) {
7695 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7696 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7697 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7698 }
7699 if (Op->getFlags().hasApproximateFuncs()) {
7700 SDValue Flags = Op.getOperand(1);
7701 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7702 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7703 }
7704 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7705 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7706 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7707 }
7708
7709 assert(DstVT.getScalarType() == MVT::bf16 &&
7710 "custom lower FP_ROUND for f16 or bf16");
7711 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7712
7713 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7714 // hardware f32 -> bf16 instruction.
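  // Rounding to odd in the first step avoids double rounding: an inexact
  // f64 -> f32 result gets an odd mantissa LSB, so the later f32 -> bf16
  // rounding cannot land on a value it would not have reached from the
  // original f64.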
7715 EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32) :
7716 MVT::f32;
7717 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7718 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7719 DAG.getTargetConstant(0, DL, MVT::i32));
7720}
7721
7722SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7723 SelectionDAG &DAG) const {
7724 EVT VT = Op.getValueType();
7725 const MachineFunction &MF = DAG.getMachineFunction();
7726 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7727 bool IsIEEEMode = Info->getMode().IEEE;
7728
7729 // FIXME: Assert during selection that this is only selected for
7730 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7731 // mode functions, but this happens to be OK since it's only done in cases
7732 // where there is known no sNaN.
7733 if (IsIEEEMode)
7734 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7735
7736 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7737 VT == MVT::v16bf16)
7738 return splitBinaryVectorOp(Op, DAG);
7739 return Op;
7740}
7741
7742SDValue
7743SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7744 SelectionDAG &DAG) const {
7745 EVT VT = Op.getValueType();
7746 const MachineFunction &MF = DAG.getMachineFunction();
7747 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7748 bool IsIEEEMode = Info->getMode().IEEE;
7749
7750 if (IsIEEEMode)
7751 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7752
7753 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7754 VT == MVT::v16bf16)
7755 return splitBinaryVectorOp(Op, DAG);
7756 return Op;
7757}
7758
7759SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7760 SelectionDAG &DAG) const {
7761 EVT VT = Op.getValueType();
7762 if (VT.isVector())
7763 return splitBinaryVectorOp(Op, DAG);
7764
7765 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7766 !Subtarget->hasMinimum3Maximum3F16() &&
7767 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7768 "should not need to widen f16 minimum/maximum to v2f16");
7769
7770 // Widen f16 operation to v2f16
7771
7772 // fminimum f16:x, f16:y ->
7773 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7774 // (v2f16 (scalar_to_vector y))), 0
7775 SDLoc SL(Op);
7776 SDValue WideSrc0 =
7777 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7778 SDValue WideSrc1 =
7779 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7780
7781 SDValue Widened =
7782 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7783
7784 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7785 DAG.getConstant(0, SL, MVT::i32));
7786}
7787
7788SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7789 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7790 EVT VT = Op.getValueType();
7791 assert(VT == MVT::f16);
7792
7793 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7794 EVT ExpVT = Exp.getValueType();
7795 if (ExpVT == MVT::i16)
7796 return Op;
7797
7798 SDLoc DL(Op);
7799
7800 // Correct the exponent type for f16 to i16.
7801 // Clamp the range of the exponent to the instruction's range.
7802
7803 // TODO: This should be a generic narrowing legalization, and can easily be
7804 // for GlobalISel.
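  // For example (illustrative), ldexp(f16 x, i32 70000) clamps the exponent to
  // 32767 before truncating it to i16; the clamped exponent still produces the
  // same overflowed result, so the observable behavior is unchanged.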
7805
7806 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7807 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7808
7809 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7810 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7811
7812 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7813
7814 if (IsStrict) {
7815 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7816 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7817 }
7818
7819 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7820}
7821
7822static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7823 switch (Op->getOpcode()) {
7824 case ISD::SRA:
7825 case ISD::SMIN:
7826 case ISD::SMAX:
7827 return ISD::SIGN_EXTEND;
7828 case ISD::SRL:
7829 case ISD::UMIN:
7830 case ISD::UMAX:
7831 return ISD::ZERO_EXTEND;
7832 case ISD::ADD:
7833 case ISD::SUB:
7834 case ISD::AND:
7835 case ISD::OR:
7836 case ISD::XOR:
7837 case ISD::SHL:
7838 case ISD::SELECT:
7839 case ISD::MUL:
7840 // operation result won't be influenced by garbage high bits.
7841 // TODO: are all of those cases correct, and are there more?
7842 return ISD::ANY_EXTEND;
7843 case ISD::SETCC: {
7844 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7845 return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7846 }
7847 default:
7848 llvm_unreachable("unexpected opcode!");
7849 }
7850}
7851
7852SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7853 DAGCombinerInfo &DCI) const {
7854 const unsigned Opc = Op.getOpcode();
7855 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7856 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7857 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7858 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7859 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7860
7861 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7862 : Op->getOperand(0).getValueType();
7863 auto ExtTy = OpTy.changeElementType(MVT::i32);
7864
7865 if (DCI.isBeforeLegalizeOps() ||
7866 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7867 return SDValue();
7868
7869 auto &DAG = DCI.DAG;
7870
7871 SDLoc DL(Op);
7872 SDValue LHS;
7873 SDValue RHS;
7874 if (Opc == ISD::SELECT) {
7875 LHS = Op->getOperand(1);
7876 RHS = Op->getOperand(2);
7877 } else {
7878 LHS = Op->getOperand(0);
7879 RHS = Op->getOperand(1);
7880 }
7881
7882 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7883 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7884
7885 // Special case: for shifts, the RHS always needs a zext.
7886 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7887 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7888 else
7889 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7890
7891 // setcc always returns i1/i1 vec, so no need to truncate after.
7892 if (Opc == ISD::SETCC) {
7893 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7894 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7895 }
7896
7897 // For other ops, we extend the operation's return type as well so we need to
7898 // truncate back to the original type.
7899 SDValue NewVal;
7900 if (Opc == ISD::SELECT)
7901 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7902 else
7903 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7904
7905 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7906}
7907
7908SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7909 SDValue Mag = Op.getOperand(0);
7910 EVT MagVT = Mag.getValueType();
7911
7912 if (MagVT.getVectorNumElements() > 2)
7913 return splitBinaryVectorOp(Op, DAG);
7914
7915 SDValue Sign = Op.getOperand(1);
7916 EVT SignVT = Sign.getValueType();
7917
7918 if (MagVT == SignVT)
7919 return Op;
7920
7921 // fcopysign v2f16:mag, v2f32:sign ->
7922 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7923
7924 SDLoc SL(Op);
7925 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7926 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7927
7928 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7929
7930 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7931}
7932
7933// Custom lowering for vector multiplications and s_mul_u64.
7934SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7935 EVT VT = Op.getValueType();
7936
7937 // Split vector operands.
7938 if (VT.isVector())
7939 return splitBinaryVectorOp(Op, DAG);
7940
7941 assert(VT == MVT::i64 && "The following code is a special case for s_mul_u64");
7942
7943 // There are four ways to lower s_mul_u64:
7944 //
7945 // 1. If all the operands are uniform, then we lower it as it is.
7946 //
7947 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
7948 // multiplications because there is not a vector equivalent of s_mul_u64.
7949 //
7950 // 3. If the cost model decides that it is more efficient to use vector
7951 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
7952 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7953 //
7954 // 4. If the cost model decides to use vector registers and both of the
7955 // operands are zero-extended/sign-extended from 32-bits, then we split the
7956 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
7957 // possible to check if the operands are zero-extended or sign-extended in
7958 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7959 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7960 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7961 // If the cost model decides that we have to use vector registers, then
7962 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
7963 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
7964 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7965 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7966 // SIInstrInfo.cpp .
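  // For example, a uniform (mul i64 (zext i32 a), (zext i32 b)) has at least
  // 32 known leading zero bits in each operand, so the code below replaces it
  // with S_MUL_U64_U32_PSEUDO; that pseudo can still be split into 32-bit
  // multiplies later if the operands end up in vector registers.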
7967
7968 if (Op->isDivergent())
7969 return SDValue();
7970
7971 SDValue Op0 = Op.getOperand(0);
7972 SDValue Op1 = Op.getOperand(1);
7973 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64
7974 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7975 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7976 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7977 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7978 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7979 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7980 SDLoc SL(Op);
7981 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7982 return SDValue(
7983 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7984 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7985 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7986 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7987 return SDValue(
7988 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7989 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7990 return Op;
7991}
7992
7993SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7994 EVT VT = Op.getValueType();
7995 SDLoc SL(Op);
7996 SDValue LHS = Op.getOperand(0);
7997 SDValue RHS = Op.getOperand(1);
7998 bool isSigned = Op.getOpcode() == ISD::SMULO;
7999
8000 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8001 const APInt &C = RHSC->getAPIntValue();
8002 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
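  // For example (i32): smulo(x, 8) becomes {x << 3, ((x << 3) >> 3) != x},
  // using an arithmetic shift so that signed overflow is detected.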
8003 if (C.isPowerOf2()) {
8004 // smulo(x, signed_min) is same as umulo(x, signed_min).
8005 bool UseArithShift = isSigned && !C.isMinSignedValue();
8006 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8007 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8008 SDValue Overflow =
8009 DAG.getSetCC(SL, MVT::i1,
8010 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8011 Result, ShiftAmt),
8012 LHS, ISD::SETNE);
8013 return DAG.getMergeValues({Result, Overflow}, SL);
8014 }
8015 }
8016
8017 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8018 SDValue Top =
8019 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8020
8021 SDValue Sign = isSigned
8022 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8023 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8024 SL, MVT::i32))
8025 : DAG.getConstant(0, SL, VT);
8026 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8027
8028 return DAG.getMergeValues({Result, Overflow}, SL);
8029}
8030
8031SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8032 if (Op->isDivergent()) {
8033 // Select to V_MAD_[IU]64_[IU]32.
8034 return Op;
8035 }
8036 if (Subtarget->hasSMulHi()) {
8037 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8038 return SDValue();
8039 }
8040 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8041 // calculate the high part, so we might as well do the whole thing with
8042 // V_MAD_[IU]64_[IU]32.
8043 return Op;
8044}
8045
8046SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8047 if (!Subtarget->isTrapHandlerEnabled() ||
8048 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8049 return lowerTrapEndpgm(Op, DAG);
8050
8051 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8052 : lowerTrapHsaQueuePtr(Op, DAG);
8053}
8054
8055SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8056 SDLoc SL(Op);
8057 SDValue Chain = Op.getOperand(0);
8058 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8059}
8060
8061SDValue
8062SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8063 const SDLoc &DL, Align Alignment,
8064 ImplicitParameter Param) const {
8065 MachineFunction &MF = DAG.getMachineFunction();
8066 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8067 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8068 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8069 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8072}
8073
8074SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8075 SelectionDAG &DAG) const {
8076 SDLoc SL(Op);
8077 SDValue Chain = Op.getOperand(0);
8078
8079 SDValue QueuePtr;
8080 // For code object version 5, QueuePtr is passed through implicit kernarg.
8081 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8082 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8083 QueuePtr =
8084 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8085 } else {
8086 MachineFunction &MF = DAG.getMachineFunction();
8087 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8088 Register UserSGPR = Info->getQueuePtrUserSGPR();
8089
8090 if (UserSGPR == AMDGPU::NoRegister) {
8091 // We probably are in a function incorrectly marked with
8092 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8093 // trap, so just use a null pointer.
8094 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8095 } else {
8096 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8097 MVT::i64);
8098 }
8099 }
8100
8101 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8102 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8103
8104 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8105 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8106 ToReg.getValue(1)};
8107 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8108}
8109
8110SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8111 SDLoc SL(Op);
8112 SDValue Chain = Op.getOperand(0);
8113
8114 // We need to simulate the 's_trap 2' instruction on targets that run in
8115 // PRIV=1 (where it is treated as a nop).
8116 if (Subtarget->hasPrivEnabledTrap2NopBug())
8117 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8118
8119 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8120 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8121 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8122}
8123
8124SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8125 SDLoc SL(Op);
8126 SDValue Chain = Op.getOperand(0);
8127 MachineFunction &MF = DAG.getMachineFunction();
8128
8129 if (!Subtarget->isTrapHandlerEnabled() ||
8130 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8131 LLVMContext &Ctx = MF.getFunction().getContext();
8132 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8133 "debugtrap handler not supported",
8134 Op.getDebugLoc(), DS_Warning));
8135 return Chain;
8136 }
8137
8138 uint64_t TrapID =
8139 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8140 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8141 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8142}
8143
8144SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8145 SelectionDAG &DAG) const {
8146 if (Subtarget->hasApertureRegs()) {
8147 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8148 ? AMDGPU::SRC_SHARED_BASE
8149 : AMDGPU::SRC_PRIVATE_BASE;
8150 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8151 !Subtarget->hasGloballyAddressableScratch()) &&
8152 "Cannot use src_private_base with globally addressable scratch!");
8153 // Note: this feature (register) is broken. When used as a 32-bit operand,
8154 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8155 // bits.
8156 //
8157 // To work around the issue, emit a 64 bit copy from this register
8158 // then extract the high bits. Note that this shouldn't even result in a
8159 // shift being emitted and simply become a pair of registers (e.g.):
8160 // s_mov_b64 s[6:7], src_shared_base
8161 // v_mov_b32_e32 v1, s7
8162 SDValue Copy =
8163 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8164 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8165 }
8166
8167 // For code object version 5, private_base and shared_base are passed through
8168 // implicit kernargs.
8169 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8170 if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
8171 ImplicitParameter Param =
8172 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
8173 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8174 }
8175
8176 MachineFunction &MF = DAG.getMachineFunction();
8177 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8178 Register UserSGPR = Info->getQueuePtrUserSGPR();
8179 if (UserSGPR == AMDGPU::NoRegister) {
8180 // We probably are in a function incorrectly marked with
8181 // amdgpu-no-queue-ptr. This is undefined.
8182 return DAG.getPOISON(MVT::i32);
8183 }
8184
8185 SDValue QueuePtr =
8186 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8187
8188 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8189 // private_segment_aperture_base_hi.
8190 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8191
8192 SDValue Ptr =
8193 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8194
8195 // TODO: Use custom target PseudoSourceValue.
8196 // TODO: We should use the value from the IR intrinsic call, but it might not
8197 // be available and how do we get it?
8198 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8199 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8200 commonAlignment(Align(64), StructOffset),
8201 MachineMemOperand::MODereferenceable |
8202 MachineMemOperand::MOInvariant);
8203}
8204
8205/// Return true if the value is a known valid address, such that a null check is
8206/// not necessary.
8207static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
8208 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8209 if (isa<FrameIndexSDNode, GlobalAddressSDNode, BasicBlockSDNode>(Val))
8210 return true;
8211
8212 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8213 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8214
8215 // TODO: Search through arithmetic, handle arguments and loads
8216 // marked nonnull.
8217 return false;
8218}
8219
8220SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8221 SelectionDAG &DAG) const {
8222 SDLoc SL(Op);
8223
8224 const AMDGPUTargetMachine &TM =
8225 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8226
8227 unsigned DestAS, SrcAS;
8228 SDValue Src;
8229 bool IsNonNull = false;
8230 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8231 SrcAS = ASC->getSrcAddressSpace();
8232 Src = ASC->getOperand(0);
8233 DestAS = ASC->getDestAddressSpace();
8234 } else {
8235 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8236 Op.getConstantOperandVal(0) ==
8237 Intrinsic::amdgcn_addrspacecast_nonnull);
8238 Src = Op->getOperand(1);
8239 SrcAS = Op->getConstantOperandVal(2);
8240 DestAS = Op->getConstantOperandVal(3);
8241 IsNonNull = true;
8242 }
8243
8244 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8245
8246 // flat -> local/private
8247 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8248 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8249 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8250 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8251
8252 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8253 Subtarget->hasGloballyAddressableScratch()) {
8254 // flat -> private with globally addressable scratch: subtract
8255 // src_flat_scratch_base_lo.
8256 SDValue FlatScratchBaseLo(
8257 DAG.getMachineNode(
8258 AMDGPU::S_MOV_B32, SL, MVT::i32,
8259 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8260 0);
8261 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8262 }
8263
8264 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8265 return Ptr;
8266
8267 unsigned NullVal = TM.getNullPointerValue(DestAS);
8268 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8269 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8270
8271 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8272 SegmentNullPtr);
8273 }
8274 }
8275
8276 // local/private -> flat
8277 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8278 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8279 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8280 SDValue CvtPtr;
8281 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8282 Subtarget->hasGloballyAddressableScratch()) {
8283 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8284 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8285 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8286 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8287 ThreadID = DAG.getNode(
8288 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8289 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8290 AllOnes, ThreadID);
8291 if (Subtarget->isWave64())
8292 ThreadID = DAG.getNode(
8293 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8294 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8295 AllOnes, ThreadID);
8296 SDValue ShAmt = DAG.getShiftAmountConstant(
8297 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8298 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8299 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8300 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8301 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8302 // 64-bit hi:lo value.
8303 SDValue FlatScratchBase = {
8304 DAG.getMachineNode(
8305 AMDGPU::S_MOV_B64, SL, MVT::i64,
8306 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8307 0};
8308 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8309 } else {
8310 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8311 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8312 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8313 }
8314
8315 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8316 return CvtPtr;
8317
8318 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8319 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8320
8321 SDValue NonNull =
8322 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8323
8324 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8325 FlatNullPtr);
8326 }
8327 }
8328
8329 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8330 Op.getValueType() == MVT::i64) {
8331 const SIMachineFunctionInfo *Info =
8332 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8333 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8334 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8335 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8336 }
8337
8338 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8339 Src.getValueType() == MVT::i64)
8340 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8341
8342 // global <-> flat are no-ops and never emitted.
8343
8344 // Invalid casts are poison.
8345 return DAG.getPOISON(Op->getValueType(0));
8346}
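// A minimal sketch of the common local -> flat case handled above, assuming a
// 32-bit LDS pointer %p:
//   %f = addrspacecast ptr addrspace(3) %p to ptr
// becomes, conceptually,
//   %wide = build_pair %p, shared_aperture_hi   ; low 32 bits are %p
//   %f    = select (%p != -1), %wide, 0         ; -1/0 are the segment/flat nulls
// with the compare dropped entirely when the source is known non-null.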
8347
8348// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8349// the small vector and inserting them into the big vector. That is better than
8350// the default expansion of doing it via a stack slot. Even though the use of
8351// the stack slot would be optimized away afterwards, the stack slot itself
8352// remains.
8353SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8354 SelectionDAG &DAG) const {
8355 SDValue Vec = Op.getOperand(0);
8356 SDValue Ins = Op.getOperand(1);
8357 SDValue Idx = Op.getOperand(2);
8358 EVT VecVT = Vec.getValueType();
8359 EVT InsVT = Ins.getValueType();
8360 EVT EltVT = VecVT.getVectorElementType();
8361 unsigned InsNumElts = InsVT.getVectorNumElements();
8362 unsigned IdxVal = Idx->getAsZExtVal();
8363 SDLoc SL(Op);
8364
8365 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8366 // Insert 32-bit registers at a time.
8367 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8368
8369 unsigned VecNumElts = VecVT.getVectorNumElements();
8370 EVT NewVecVT =
8371 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8372 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8373 : EVT::getVectorVT(*DAG.getContext(),
8374 MVT::i32, InsNumElts / 2);
8375
8376 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8377 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8378
8379 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8380 SDValue Elt;
8381 if (InsNumElts == 2) {
8382 Elt = Ins;
8383 } else {
8384 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8385 DAG.getConstant(I, SL, MVT::i32));
8386 }
8387 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8388 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8389 }
8390
8391 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8392 }
8393
8394 for (unsigned I = 0; I != InsNumElts; ++I) {
8395 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8396 DAG.getConstant(I, SL, MVT::i32));
8397 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8398 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8399 }
8400 return Vec;
8401}
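// Illustrative example: inserting a v2i16 value into a v4i16 vector at index
// 2 takes the 16-bit path above, so both operands are bitcast to 32-bit
// pieces and the insert becomes a single i32 insert_vector_elt at element 1.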
8402
8403SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8404 SelectionDAG &DAG) const {
8405 SDValue Vec = Op.getOperand(0);
8406 SDValue InsVal = Op.getOperand(1);
8407 SDValue Idx = Op.getOperand(2);
8408 EVT VecVT = Vec.getValueType();
8409 EVT EltVT = VecVT.getVectorElementType();
8410 unsigned VecSize = VecVT.getSizeInBits();
8411 unsigned EltSize = EltVT.getSizeInBits();
8412 SDLoc SL(Op);
8413
8414 // Specially handle the case of v4i16 with static indexing.
8415 unsigned NumElts = VecVT.getVectorNumElements();
8416 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8417 if (NumElts == 4 && EltSize == 16 && KIdx) {
8418 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8419
8420 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8421 DAG.getConstant(0, SL, MVT::i32));
8422 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8423 DAG.getConstant(1, SL, MVT::i32));
8424
8425 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8426 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8427
8428 unsigned Idx = KIdx->getZExtValue();
8429 bool InsertLo = Idx < 2;
8430 SDValue InsHalf = DAG.getNode(
8431 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8432 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8433 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8434
8435 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8436
8437 SDValue Concat =
8438 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8439 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8440
8441 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8442 }
8443
8444 // Static indexing does not lower to stack access, and hence there is no need
8445 // for special custom lowering to avoid stack access.
8446 if (isa<ConstantSDNode>(Idx))
8447 return SDValue();
8448
8449 // Avoid stack access for dynamic indexing by custom lowering to
8450 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
8451
8452 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8453
8454 MVT IntVT = MVT::getIntegerVT(VecSize);
8455
8456 // Convert vector index to bit-index and get the required bit mask.
8457 assert(isPowerOf2_32(EltSize));
8458 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8459 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8460 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8461 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8462 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8463
8464 // 1. Create a congruent vector with the target value in each element.
8465 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8466 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8467
8468 // 2. Mask off all other indices except the required index within (1).
8469 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8470
8471 // 3. Mask off the required index within the target vector.
8472 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8473 SDValue RHS =
8474 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8475
8476 // 4. Get (2) and (3) ORed into the target vector.
8477 SDValue BFI =
8478 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8479
8480 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8481}
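// Worked example (illustrative): for a v4i16 vector and dynamic index Idx,
// EltSize is 16, so ScaledIdx = Idx << 4 and BFM = 0xffff << ScaledIdx. Step
// (2) then keeps only the inserted value's bits at the target position, step
// (3) clears exactly those bits in the original vector, and the disjoint OR
// in step (4) merges the two halves.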
8482
8483SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8484 SelectionDAG &DAG) const {
8485 SDLoc SL(Op);
8486
8487 EVT ResultVT = Op.getValueType();
8488 SDValue Vec = Op.getOperand(0);
8489 SDValue Idx = Op.getOperand(1);
8490 EVT VecVT = Vec.getValueType();
8491 unsigned VecSize = VecVT.getSizeInBits();
8492 EVT EltVT = VecVT.getVectorElementType();
8493
8494 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8495
8496 // Make sure we do any optimizations that will make it easier to fold
8497 // source modifiers before obscuring it with bit operations.
8498
8499 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8500 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8501 return Combined;
8502
8503 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8504 SDValue Lo, Hi;
8505 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8506
8507 if (VecSize == 128) {
8508 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8509 Lo = DAG.getBitcast(LoVT,
8510 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8511 DAG.getConstant(0, SL, MVT::i32)));
8512 Hi = DAG.getBitcast(HiVT,
8513 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8514 DAG.getConstant(1, SL, MVT::i32)));
8515 } else if (VecSize == 256) {
8516 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8517 SDValue Parts[4];
8518 for (unsigned P = 0; P < 4; ++P) {
8519 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8520 DAG.getConstant(P, SL, MVT::i32));
8521 }
8522
8523 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8524 Parts[0], Parts[1]));
8525 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8526 Parts[2], Parts[3]));
8527 } else {
8528 assert(VecSize == 512);
8529
8530 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8531 SDValue Parts[8];
8532 for (unsigned P = 0; P < 8; ++P) {
8533 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8534 DAG.getConstant(P, SL, MVT::i32));
8535 }
8536
8537 Lo = DAG.getBitcast(LoVT,
8538 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8539 Parts[0], Parts[1], Parts[2], Parts[3]));
8540 Hi = DAG.getBitcast(HiVT,
8541 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8542 Parts[4], Parts[5], Parts[6], Parts[7]));
8543 }
8544
8545 EVT IdxVT = Idx.getValueType();
8546 unsigned NElem = VecVT.getVectorNumElements();
8547 assert(isPowerOf2_32(NElem));
8548 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8549 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8550 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8551 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8552 }
8553
8554 assert(VecSize <= 64);
8555
8556 MVT IntVT = MVT::getIntegerVT(VecSize);
8557
8558 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8559 SDValue VecBC = peekThroughBitcasts(Vec);
8560 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8561 SDValue Src = VecBC.getOperand(0);
8562 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8563 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8564 }
8565
8566 unsigned EltSize = EltVT.getSizeInBits();
8567 assert(isPowerOf2_32(EltSize));
8568
8569 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8570
8571 // Convert vector index to bit-index (* EltSize)
8572 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8573
8574 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8575 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8576
8577 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8578 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8579 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8580 }
8581
8582 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8583}
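// Illustrative example for the <= 64-bit path: extracting element Idx from a
// v4i16 bitcasts the vector to i64, shifts right by 16 * Idx, and truncates
// to 16 bits, so even a divergent index avoids a stack round-trip.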
8584
8585static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8586 assert(Elt % 2 == 0);
8587 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8588}
8589
8590static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8591 assert(Elt % 2 == 0);
8592 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8593 !(Mask[Elt + 1] & 1);
8594}
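// For instance, Mask = <3,2,7,6> has two odd-to-even pairs (3->2 and 7->6),
// while Mask = <0,1,6,7> has two contiguous pairs; the shuffle lowering below
// chooses a strategy independently for each even-aligned pair.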
8595
8596SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8597 SelectionDAG &DAG) const {
8598 SDLoc SL(Op);
8599 EVT ResultVT = Op.getValueType();
8600 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8601 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8602 const int NewSrcNumElts = 2;
8603 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8604 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8605
8606 // Break up the shuffle into registers sized pieces.
8607 //
8608 // We're trying to form sub-shuffles that the register allocation pipeline
8609 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8610 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8611 // pair of copies into a consecutive register copy, so use the ordinary
8612 // extract_vector_elt lowering unless we can use the shuffle.
8613 //
8614 // TODO: This is a bit of a hack, and we should probably always use
8615 // extract_subvector for the largest possible subvector we can (or at least
8616 // use it for PackVT-aligned pieces). However, we have worse support for
8617 // combines on them and don't directly treat extract_subvector /
8618 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
8619 // job with the extract_subvectors.
8620 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8621
8622 // vector_shuffle <0,1,6,7> lhs, rhs
8623 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8624 //
8625 // vector_shuffle <6,7,2,3> lhs, rhs
8626 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8627 //
8628 // vector_shuffle <6,7,0,1> lhs, rhs
8629 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8630
8631 // Avoid scalarizing when both halves are reading from consecutive elements.
8632
8633 // If we're treating 2 element shuffles as legal, also create odd-to-even
8634 // shuffles of neighboring pairs.
8635 //
8636 // vector_shuffle <3,2,7,6> lhs, rhs
8637 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8638 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8639
8640 SmallVector<SDValue, 16> Pieces;
8641 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8642 if (ShouldUseConsecutiveExtract &&
8643 elementPairIsContiguous(SVN->getMask(), I)) {
8644 const int Idx = SVN->getMaskElt(I);
8645 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8646 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8647 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8648 SVN->getOperand(VecIdx),
8649 DAG.getConstant(EltIdx, SL, MVT::i32));
8650 Pieces.push_back(SubVec);
8651 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8652 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8653 int Idx0 = SVN->getMaskElt(I);
8654 int Idx1 = SVN->getMaskElt(I + 1);
8655
8656 SDValue SrcOp0 = SVN->getOperand(0);
8657 SDValue SrcOp1 = SrcOp0;
8658 if (Idx0 >= SrcNumElts) {
8659 SrcOp0 = SVN->getOperand(1);
8660 Idx0 -= SrcNumElts;
8661 }
8662
8663 if (Idx1 >= SrcNumElts) {
8664 SrcOp1 = SVN->getOperand(1);
8665 Idx1 -= SrcNumElts;
8666 }
8667
8668 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8669 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8670
8671 // Extract nearest even aligned piece.
8672 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8673 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8674 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8675 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8676
8677 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8678 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8679
8680 SDValue Result0 = SubVec0;
8681 SDValue Result1 = SubVec0;
8682
8683 if (SubVec0 != SubVec1) {
8684 NewMaskIdx1 += NewSrcNumElts;
8685 Result1 = SubVec1;
8686 } else {
8687 Result1 = DAG.getPOISON(PackVT);
8688 }
8689
8690 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8691 {NewMaskIdx0, NewMaskIdx1});
8692 Pieces.push_back(Shuf);
8693 } else {
8694 const int Idx0 = SVN->getMaskElt(I);
8695 const int Idx1 = SVN->getMaskElt(I + 1);
8696 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8697 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8698 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8699 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8700
8701 SDValue Vec0 = SVN->getOperand(VecIdx0);
8702 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8703 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8704
8705 SDValue Vec1 = SVN->getOperand(VecIdx1);
8706 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8707 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8708 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8709 }
8710 }
8711
8712 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8713}
8714
8715SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8716 SelectionDAG &DAG) const {
8717 SDValue SVal = Op.getOperand(0);
8718 EVT ResultVT = Op.getValueType();
8719 EVT SValVT = SVal.getValueType();
8720 SDValue UndefVal = DAG.getPOISON(SValVT);
8721 SDLoc SL(Op);
8722
8723 SmallVector<SDValue, 16> VElts;
8724 VElts.push_back(SVal);
8725 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8726 VElts.push_back(UndefVal);
8727
8728 return DAG.getBuildVector(ResultVT, SL, VElts);
8729}
8730
8731SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8732 SelectionDAG &DAG) const {
8733 SDLoc SL(Op);
8734 EVT VT = Op.getValueType();
8735
8736 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8737 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8738
8739 SDValue Lo = Op.getOperand(0);
8740 SDValue Hi = Op.getOperand(1);
8741
8742 // Avoid adding defined bits with the zero_extend.
8743 if (Hi.isUndef()) {
8744 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8745 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8746 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8747 }
8748
8749 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8750 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8751
8752 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8753 DAG.getConstant(16, SL, MVT::i32));
8754 if (Lo.isUndef())
8755 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8756
8757 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8758 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8759
8760 SDValue Or =
8761 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8762 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8763 }
8764
8765 // Split into 2-element chunks.
8766 const unsigned NumParts = VT.getVectorNumElements() / 2;
8767 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8768 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8769
8771 for (unsigned P = 0; P < NumParts; ++P) {
8772 SDValue Vec = DAG.getBuildVector(
8773 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8774 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8775 }
8776
8777 SDValue Blend =
8778 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8779 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8780}
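// Illustrative example: without VOP3P instructions, build_vector of two f16
// values is materialized on an i32 as (zext(hi) << 16) | zext(lo); longer
// 16-bit vectors are assembled from such 32-bit parts and bitcast back.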
8781
8782bool SITargetLowering::isOffsetFoldingLegal(
8783 const GlobalAddressSDNode *GA) const {
8784 // OSes that use ELF REL relocations (instead of RELA) can only store a
8785 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8786 // which can create arbitrary 64-bit addends. (This is only a problem for
8787 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8788 // the high 32 bits of the addend.)
8789 //
8790 // This should be kept in sync with how HasRelocationAddend is initialized in
8791 // the constructor of ELFAMDGPUAsmBackend.
8792 if (!Subtarget->isAmdHsaOS())
8793 return false;
8794
8795 // We can fold offsets for anything that doesn't require a GOT relocation.
8796 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8797 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8798 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8799 !shouldEmitGOTReloc(GA->getGlobal());
8800}
8801
8802static SDValue
8803buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8804 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8805 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8806 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8807 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8808 // lowered to the following code sequence:
8809 //
8810 // For constant address space:
8811 // s_getpc_b64 s[0:1]
8812 // s_add_u32 s0, s0, $symbol
8813 // s_addc_u32 s1, s1, 0
8814 //
8815 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8816 // a fixup or relocation is emitted to replace $symbol with a literal
8817 // constant, which is a pc-relative offset from the encoding of the $symbol
8818 // operand to the global variable.
8819 //
8820 // For global address space:
8821 // s_getpc_b64 s[0:1]
8822 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8823 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8824 //
8825 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8826 // fixups or relocations are emitted to replace $symbol@*@lo and
8827 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8828 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8829 // operand to the global variable.
8830 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8831 assert(GAFlags != SIInstrInfo::MO_NONE);
8832
8833 SDValue Ptr =
8834 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8835 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8836 }
8837
8838 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8839 SDValue PtrHi;
8840 if (GAFlags == SIInstrInfo::MO_NONE)
8841 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8842 else
8843 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8844 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8845}
8846
8847SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8848 SDValue Op,
8849 SelectionDAG &DAG) const {
8850 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8851 SDLoc DL(GSD);
8852 EVT PtrVT = Op.getValueType();
8853
8854 const GlobalValue *GV = GSD->getGlobal();
8860 GV->hasExternalLinkage()) {
8861 Type *Ty = GV->getValueType();
8862 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
8863 // zero-sized type in other languages to declare dynamic shared memory
8864 // whose size is not known at compile time. Such arrays are allocated by
8865 // the runtime and placed directly after the statically allocated ones,
8866 // so they all share the same offset.
8867 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8868 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8869 // Adjust alignment for that dynamic shared memory array.
8872 MFI->setUsesDynamicLDS(true);
8873 return SDValue(
8874 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8875 }
8876 }
8877 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8878 }
8879
8880 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8881 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8882 SIInstrInfo::MO_ABS32_LO);
8883 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8884 }
8885
8886 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8887 if (Subtarget->has64BitLiterals()) {
8888 SDValue Addr = DAG.getTargetGlobalAddress(
8889 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8890 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8891 0);
8892 }
8893
8894 SDValue AddrLo = DAG.getTargetGlobalAddress(
8895 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8896 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8897
8898 SDValue AddrHi = DAG.getTargetGlobalAddress(
8899 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8900 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8901
8902 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8903 }
8904
8905 if (shouldEmitFixup(GV))
8906 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8907
8908 if (shouldEmitPCReloc(GV))
8909 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8910 SIInstrInfo::MO_REL32);
8911
8912 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8913 SIInstrInfo::MO_GOTPCREL32);
8914 PointerType *PtrTy =
8915 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8916 const DataLayout &DataLayout = DAG.getDataLayout();
8917 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8918 MachinePointerInfo PtrInfo =
8919 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8920
8921 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8922 MachineMemOperand::MODereferenceable |
8923 MachineMemOperand::MOInvariant);
8924}
8925
8926SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8927 const SDLoc &DL, SDValue V) const {
8928 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8929 // the destination register.
8930 //
8931 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8932 // so we will end up with redundant moves to m0.
8933 //
8934 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8935
8936 // A Null SDValue creates a glue result.
8937 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8938 V, Chain);
8939 return SDValue(M0, 0);
8940}
8941
8942SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8943 MVT VT,
8944 unsigned Offset) const {
8945 SDLoc SL(Op);
8946 SDValue Param = lowerKernargMemParameter(
8947 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8948 // The local size values will have the hi 16-bits as zero.
8949 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8950 DAG.getValueType(VT));
8951}
8952
8953static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8954 EVT VT) {
8955 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8956 DAG.getMachineFunction().getFunction(),
8957 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8958 return DAG.getPOISON(VT);
8959}
8960
8961static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8962 EVT VT) {
8963 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8964 DAG.getMachineFunction().getFunction(),
8965 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8966 return DAG.getPOISON(VT);
8967}
8968
8969static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8970 ArrayRef<SDValue> Elts) {
8971 assert(!Elts.empty());
8972 MVT Type;
8973 unsigned NumElts = Elts.size();
8974
8975 if (NumElts <= 12) {
8976 Type = MVT::getVectorVT(MVT::f32, NumElts);
8977 } else {
8978 assert(Elts.size() <= 16);
8979 Type = MVT::v16f32;
8980 NumElts = 16;
8981 }
8982
8983 SmallVector<SDValue, 16> VecElts(NumElts);
8984 for (unsigned i = 0; i < Elts.size(); ++i) {
8985 SDValue Elt = Elts[i];
8986 if (Elt.getValueType() != MVT::f32)
8987 Elt = DAG.getBitcast(MVT::f32, Elt);
8988 VecElts[i] = Elt;
8989 }
8990 for (unsigned i = Elts.size(); i < NumElts; ++i)
8991 VecElts[i] = DAG.getPOISON(MVT::f32);
8992
8993 if (NumElts == 1)
8994 return VecElts[0];
8995 return DAG.getBuildVector(Type, DL, VecElts);
8996}
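// Illustrative example: three address dwords become a v3f32 build_vector,
// while 13 dwords are padded with poison elements up to v16f32, matching the
// register-tuple sizes the MIMG instruction definitions expect.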
8997
8998static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8999 SDValue Src, int ExtraElts) {
9000 EVT SrcVT = Src.getValueType();
9001
9003
9004 if (SrcVT.isVector())
9005 DAG.ExtractVectorElements(Src, Elts);
9006 else
9007 Elts.push_back(Src);
9008
9009 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9010 while (ExtraElts--)
9011 Elts.push_back(Undef);
9012
9013 return DAG.getBuildVector(CastVT, DL, Elts);
9014}
9015
9016// Reconstruct the required return value for an image load intrinsic.
9017// This is more complicated due to the optional use of TexFailCtrl, which means
9018// the required return type is an aggregate.
9019static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9020 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9021 bool Unpacked, bool IsD16, int DMaskPop,
9022 int NumVDataDwords, bool IsAtomicPacked16Bit,
9023 const SDLoc &DL) {
9024 // Determine the required return type. This is the same regardless of
9025 // IsTexFail flag
9026 EVT ReqRetVT = ResultTypes[0];
9027 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9028 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9029 ? (ReqRetNumElts + 1) / 2
9030 : ReqRetNumElts;
9031
9032 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9033
9034 MVT DataDwordVT =
9035 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9036
9037 MVT MaskPopVT =
9038 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9039
9040 SDValue Data(Result, 0);
9041 SDValue TexFail;
9042
9043 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9044 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9045 if (MaskPopVT.isVector()) {
9046 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9047 SDValue(Result, 0), ZeroIdx);
9048 } else {
9049 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9050 SDValue(Result, 0), ZeroIdx);
9051 }
9052 }
9053
9054 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9055 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9056 NumDataDwords - MaskPopDwords);
9057
9058 if (IsD16)
9059 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9060
9061 EVT LegalReqRetVT = ReqRetVT;
9062 if (!ReqRetVT.isVector()) {
9063 if (!Data.getValueType().isInteger())
9064 Data = DAG.getNode(ISD::BITCAST, DL,
9065 Data.getValueType().changeTypeToInteger(), Data);
9066 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9067 } else {
9068 // We need to widen the return vector to a legal type
9069 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9070 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9071 LegalReqRetVT =
9073 ReqRetVT.getVectorNumElements() + 1);
9074 }
9075 }
9076 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9077
9078 if (IsTexFail) {
9079 TexFail =
9080 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9081 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9082
9083 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9084 }
9085
9086 if (Result->getNumValues() == 1)
9087 return Data;
9088
9089 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9090}
9091
9092static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9093 SDValue *LWE, bool &IsTexFail) {
9094 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9095
9096 uint64_t Value = TexFailCtrlConst->getZExtValue();
9097 if (Value) {
9098 IsTexFail = true;
9099 }
9100
9101 SDLoc DL(TexFailCtrlConst);
9102 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9103 Value &= ~(uint64_t)0x1;
9104 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9105 Value &= ~(uint64_t)0x2;
9106
9107 return Value == 0;
9108}
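// For example, a texfailctrl immediate of 3 enables both TFE (bit 0) and LWE
// (bit 1); any set bit outside the low two makes this return false, and the
// caller then leaves the intrinsic unlowered.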
9109
9110static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9111 MVT PackVectorVT,
9112 SmallVectorImpl<SDValue> &PackedAddrs,
9113 unsigned DimIdx, unsigned EndIdx,
9114 unsigned NumGradients) {
9115 SDLoc DL(Op);
9116 for (unsigned I = DimIdx; I < EndIdx; I++) {
9117 SDValue Addr = Op.getOperand(I);
9118
9119 // Gradients are packed with undef for each coordinate.
9120 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9121 // 1D: undef,dx/dh; undef,dx/dv
9122 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9123 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9124 if (((I + 1) >= EndIdx) ||
9125 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9126 I == DimIdx + NumGradients - 1))) {
9127 if (Addr.getValueType() != MVT::i16)
9128 Addr = DAG.getBitcast(MVT::i16, Addr);
9129 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9130 } else {
9131 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9132 I++;
9133 }
9134 Addr = DAG.getBitcast(MVT::f32, Addr);
9135 PackedAddrs.push_back(Addr);
9136 }
9137}
9138
9139SDValue SITargetLowering::lowerImage(SDValue Op,
9140 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9141 SelectionDAG &DAG, bool WithChain) const {
9142 SDLoc DL(Op);
9143 MachineFunction &MF = DAG.getMachineFunction();
9144 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9145 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9146 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9147 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9148 unsigned IntrOpcode = Intr->BaseOpcode;
9149 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9150 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9151 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9152
9153 SmallVector<EVT, 3> ResultTypes(Op->values());
9154 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9155 bool IsD16 = false;
9156 bool IsG16 = false;
9157 bool IsA16 = false;
9158 SDValue VData;
9159 int NumVDataDwords = 0;
9160 bool AdjustRetType = false;
9161 bool IsAtomicPacked16Bit = false;
9162
9163 // Offset of intrinsic arguments
9164 const unsigned ArgOffset = WithChain ? 2 : 1;
9165
9166 unsigned DMask;
9167 unsigned DMaskLanes = 0;
9168
9169 if (BaseOpcode->Atomic) {
9170 VData = Op.getOperand(2);
9171
9172 IsAtomicPacked16Bit =
9173 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9174 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9175
9176 bool Is64Bit = VData.getValueSizeInBits() == 64;
9177 if (BaseOpcode->AtomicX2) {
9178 SDValue VData2 = Op.getOperand(3);
9179 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9180 {VData, VData2});
9181 if (Is64Bit)
9182 VData = DAG.getBitcast(MVT::v4i32, VData);
9183
9184 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9185 DMask = Is64Bit ? 0xf : 0x3;
9186 NumVDataDwords = Is64Bit ? 4 : 2;
9187 } else {
9188 DMask = Is64Bit ? 0x3 : 0x1;
9189 NumVDataDwords = Is64Bit ? 2 : 1;
9190 }
9191 } else {
9192 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
9193 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9194
9195 if (BaseOpcode->Store) {
9196 VData = Op.getOperand(2);
9197
9198 MVT StoreVT = VData.getSimpleValueType();
9199 if (StoreVT.getScalarType() == MVT::f16) {
9200 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9201 return Op; // D16 is unsupported for this instruction
9202
9203 IsD16 = true;
9204 VData = handleD16VData(VData, DAG, true);
9205 }
9206
9207 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9208 } else if (!BaseOpcode->NoReturn) {
9209 // Work out the num dwords based on the dmask popcount and underlying type
9210 // and whether packing is supported.
9211 MVT LoadVT = ResultTypes[0].getSimpleVT();
9212 if (LoadVT.getScalarType() == MVT::f16) {
9213 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9214 return Op; // D16 is unsupported for this instruction
9215
9216 IsD16 = true;
9217 }
9218
9219 // Confirm that the return type is large enough for the dmask specified
9220 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9221 (!LoadVT.isVector() && DMaskLanes > 1))
9222 return Op;
9223
9224 // The sq block of gfx8 and gfx9 do not estimate register use correctly
9225 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9226 // instructions.
9227 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9228 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9229 NumVDataDwords = (DMaskLanes + 1) / 2;
9230 else
9231 NumVDataDwords = DMaskLanes;
9232
9233 AdjustRetType = true;
9234 }
9235 }
9236
9237 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9239
9240 // Check for 16 bit addresses or derivatives and pack if true.
9241 MVT VAddrVT =
9242 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9243 MVT VAddrScalarVT = VAddrVT.getScalarType();
9244 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9245 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9246
9247 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9248 VAddrScalarVT = VAddrVT.getScalarType();
9249 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9250 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9251
9252 // Push back extra arguments.
9253 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9254 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9255 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9256 // Special handling of bias when A16 is on. Bias is of type half but
9257 // occupies full 32-bit.
9258 SDValue Bias = DAG.getBuildVector(
9259 MVT::v2f16, DL,
9260 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9261 VAddrs.push_back(Bias);
9262 } else {
9263 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9264 "Bias needs to be converted to 16 bit in A16 mode");
9265 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9266 }
9267 }
9268
9269 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9270 // 16 bit gradients are supported, but are tied to the A16 control
9271 // so both gradients and addresses must be 16 bit
9272 LLVM_DEBUG(
9273 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9274 "require 16 bit args for both gradients and addresses");
9275 return Op;
9276 }
9277
9278 if (IsA16) {
9279 if (!ST->hasA16()) {
9280 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9281 "support 16 bit addresses\n");
9282 return Op;
9283 }
9284 }
9285
9286 // We've dealt with incorrect input, so we know that if IsA16 or IsG16 is
9287 // set then we have to compress/pack operands (either addresses, gradients,
9288 // or both).
9289 // In the case where a16 and gradients are tied (no G16 support), we have
9290 // already verified that both IsA16 and IsG16 are true.
9291 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9292 // Activate g16
9293 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9294 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9295 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9296 }
9297
9298 // Add gradients (packed or unpacked)
9299 if (IsG16) {
9300 // Pack the gradients
9301 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9302 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9303 ArgOffset + Intr->GradientStart,
9304 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9305 } else {
9306 for (unsigned I = ArgOffset + Intr->GradientStart;
9307 I < ArgOffset + Intr->CoordStart; I++)
9308 VAddrs.push_back(Op.getOperand(I));
9309 }
9310
9311 // Add addresses (packed or unpacked)
9312 if (IsA16) {
9313 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9314 ArgOffset + Intr->CoordStart, VAddrEnd,
9315 0 /* No gradients */);
9316 } else {
9317 // Add uncompressed address
9318 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9319 VAddrs.push_back(Op.getOperand(I));
9320 }
9321
9322 // If the register allocator cannot place the address registers contiguously
9323 // without introducing moves, then using the non-sequential address encoding
9324 // is always preferable, since it saves VALU instructions and is usually a
9325 // wash in terms of code size or even better.
9326 //
9327 // However, we currently have no way of hinting to the register allocator that
9328 // MIMG addresses should be placed contiguously when it is possible to do so,
9329 // so force non-NSA for the common 2-address case as a heuristic.
9330 //
9331 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9332 // allocation when possible.
9333 //
9334 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9335 // set of the remaining addresses.
9336 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9337 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9338 const bool UseNSA = ST->hasNSAEncoding() &&
9339 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9340 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9341 const bool UsePartialNSA =
9342 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
9343
9344 SDValue VAddr;
9345 if (UsePartialNSA) {
9346 VAddr = getBuildDwordsVector(DAG, DL,
9347 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9348 } else if (!UseNSA) {
9349 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9350 }
9351
9352 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9353 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9354 SDValue Unorm;
9355 if (!BaseOpcode->Sampler) {
9356 Unorm = True;
9357 } else {
9358 uint64_t UnormConst =
9359 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9360
9361 Unorm = UnormConst ? True : False;
9362 }
9363
9364 SDValue TFE;
9365 SDValue LWE;
9366 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9367 bool IsTexFail = false;
9368 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9369 return Op;
9370
9371 if (IsTexFail) {
9372 if (!DMaskLanes) {
9373 // Expecting to get an error flag since TFC is on - and dmask is 0
9374 // Force dmask to be at least 1 otherwise the instruction will fail
9375 DMask = 0x1;
9376 DMaskLanes = 1;
9377 NumVDataDwords = 1;
9378 }
9379 NumVDataDwords += 1;
9380 AdjustRetType = true;
9381 }
9382
9383 // Something earlier has tagged that the return type needs adjusting.
9384 // This happens if the instruction is a load or has set TexFailCtrl flags.
9385 if (AdjustRetType) {
9386 // NumVDataDwords reflects the true number of dwords required in the return
9387 // type
9388 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9389 // This is a no-op load. This can be eliminated
9390 SDValue Undef = DAG.getPOISON(Op.getValueType());
9391 if (isa<MemSDNode>(Op))
9392 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9393 return Undef;
9394 }
9395
9396 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9397 MVT::i32, NumVDataDwords)
9398 : MVT::i32;
9399
9400 ResultTypes[0] = NewVT;
9401 if (ResultTypes.size() == 3) {
9402 // Original result was aggregate type used for TexFailCtrl results
9403 // The actual instruction returns as a vector type which has now been
9404 // created. Remove the aggregate result.
9405 ResultTypes.erase(&ResultTypes[1]);
9406 }
9407 }
9408
9409 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9410 if (BaseOpcode->Atomic)
9411 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9412 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9414 return Op;
9415
9417 if (BaseOpcode->Store || BaseOpcode->Atomic)
9418 Ops.push_back(VData); // vdata
9419 if (UsePartialNSA) {
9420 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9421 Ops.push_back(VAddr);
9422 } else if (UseNSA)
9423 append_range(Ops, VAddrs);
9424 else
9425 Ops.push_back(VAddr);
9426 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9427 EVT RsrcVT = Rsrc.getValueType();
9428 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9429 return Op;
9430 Ops.push_back(Rsrc);
9431 if (BaseOpcode->Sampler) {
9432 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9433 if (Samp.getValueType() != MVT::v4i32)
9434 return Op;
9435 Ops.push_back(Samp);
9436 }
9437 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9438 if (IsGFX10Plus)
9439 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9440 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9441 Ops.push_back(Unorm);
9442 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9443 Ops.push_back(IsA16 && // r128, a16 for gfx9
9444 ST->hasFeature(AMDGPU::FeatureR128A16)
9445 ? True
9446 : False);
9447 if (IsGFX10Plus)
9448 Ops.push_back(IsA16 ? True : False);
9449
9450 if (!Subtarget->hasGFX90AInsts())
9451 Ops.push_back(TFE); // tfe
9452 else if (TFE->getAsZExtVal()) {
9453 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9455 "TFE is not supported on this GPU", DL.getDebugLoc()));
9456 }
9457
9458 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9459 Ops.push_back(LWE); // lwe
9460 if (!IsGFX10Plus)
9461 Ops.push_back(DimInfo->DA ? True : False);
9462 if (BaseOpcode->HasD16)
9463 Ops.push_back(IsD16 ? True : False);
9464 if (isa<MemSDNode>(Op))
9465 Ops.push_back(Op.getOperand(0)); // chain
9466
9467 int NumVAddrDwords =
9468 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9469 int Opcode = -1;
9470
9471 if (IsGFX12Plus) {
9472 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9473 NumVDataDwords, NumVAddrDwords);
9474 } else if (IsGFX11Plus) {
9475 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9476 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9477 : AMDGPU::MIMGEncGfx11Default,
9478 NumVDataDwords, NumVAddrDwords);
9479 } else if (IsGFX10Plus) {
9480 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9481 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9482 : AMDGPU::MIMGEncGfx10Default,
9483 NumVDataDwords, NumVAddrDwords);
9484 } else {
9485 if (Subtarget->hasGFX90AInsts()) {
9486 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9487 NumVDataDwords, NumVAddrDwords);
9488 if (Opcode == -1) {
9489 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9491 "requested image instruction is not supported on this GPU",
9492 DL.getDebugLoc()));
9493
9494 unsigned Idx = 0;
9495 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9496 for (EVT VT : OrigResultTypes) {
9497 if (VT == MVT::Other)
9498 RetValues[Idx++] = Op.getOperand(0); // Chain
9499 else
9500 RetValues[Idx++] = DAG.getPOISON(VT);
9501 }
9502
9503 return DAG.getMergeValues(RetValues, DL);
9504 }
9505 }
9506 if (Opcode == -1 &&
9507 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9508 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9509 NumVDataDwords, NumVAddrDwords);
9510 if (Opcode == -1)
9511 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9512 NumVDataDwords, NumVAddrDwords);
9513 }
9514 if (Opcode == -1)
9515 return Op;
9516
9517 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9518 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9519 MachineMemOperand *MemRef = MemOp->getMemOperand();
9520 DAG.setNodeMemRefs(NewNode, {MemRef});
9521 }
9522
9523 if (BaseOpcode->AtomicX2) {
9525 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9526 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9527 }
9528 if (BaseOpcode->NoReturn)
9529 return SDValue(NewNode, 0);
9530 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9531 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9532 NumVDataDwords, IsAtomicPacked16Bit, DL);
9533}
9534
9535SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9536 SDValue Offset, SDValue CachePolicy,
9537 SelectionDAG &DAG) const {
9538 MachineFunction &MF = DAG.getMachineFunction();
9539
9540 const DataLayout &DataLayout = DAG.getDataLayout();
9541 Align Alignment =
9542 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9543
9544 MachineMemOperand *MMO = MF.getMachineMemOperand(
9545 MachinePointerInfo(),
9546 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9547 MachineMemOperand::MOInvariant,
9548 VT.getStoreSize(), Alignment);
9549
9550 if (!Offset->isDivergent()) {
9551 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9552
9553 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9554 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9555 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9556 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9557 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9558 SDValue BufferLoad =
9559 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9560 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9561 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9562 }
9563
9564 // Widen vec3 load to vec4.
9565 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9566 !Subtarget->hasScalarDwordx3Loads()) {
9567 EVT WidenedVT =
9568 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9569 auto WidenedOp = DAG.getMemIntrinsicNode(
9570 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9571 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9572 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9573 DAG.getVectorIdxConstant(0, DL));
9574 return Subvector;
9575 }
9576
9577 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9578 DAG.getVTList(VT), Ops, VT, MMO);
9579 }
9580
9581 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9582 // assume that the buffer is unswizzled.
9583 SDValue Ops[] = {
9584 DAG.getEntryNode(), // Chain
9585 Rsrc, // rsrc
9586 DAG.getConstant(0, DL, MVT::i32), // vindex
9587 {}, // voffset
9588 {}, // soffset
9589 {}, // offset
9590 CachePolicy, // cachepolicy
9591 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9592 };
9593 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9594 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9595 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9596 }
9597
9599 unsigned NumLoads = 1;
9600 MVT LoadVT = VT.getSimpleVT();
9601 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9602 assert((LoadVT.getScalarType() == MVT::i32 ||
9603 LoadVT.getScalarType() == MVT::f32));
9604
9605 if (NumElts == 8 || NumElts == 16) {
9606 NumLoads = NumElts / 4;
9607 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9608 }
9609
9610 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9611
9612 // Use the alignment to ensure that the required offsets will fit into the
9613 // immediate offsets.
9614 setBufferOffsets(Offset, DAG, &Ops[3],
9615 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9616
9617 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9618 for (unsigned i = 0; i < NumLoads; ++i) {
9619 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9620 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9621 LoadVT, MMO, DAG));
9622 }
9623
9624 if (NumElts == 8 || NumElts == 16)
9625 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9626
9627 return Loads[0];
9628}
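// Illustrative example: an s.buffer.load of v8f32 with a divergent offset is
// split into two v4f32 BUFFER_LOAD pieces at immediate offsets +0 and +16
// from the common base and reassembled with concat_vectors.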
9629
9630SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9631 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9632 if (!Subtarget->hasArchitectedSGPRs())
9633 return {};
9634 SDLoc SL(Op);
9635 MVT VT = MVT::i32;
9636 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9637 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9638 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9639}
9640
9641SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9642 AMDGPU::Hwreg::Id HwReg,
9643 unsigned LowBit,
9644 unsigned Width) const {
9645 SDLoc SL(Op);
9646 using namespace AMDGPU::Hwreg;
9647 return {DAG.getMachineNode(
9648 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9649 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9650 SL, MVT::i32)),
9651 0};
9652}
9653
9654SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9655 unsigned Dim,
9656 const ArgDescriptor &Arg) const {
9657 SDLoc SL(Op);
9658 MachineFunction &MF = DAG.getMachineFunction();
9659 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9660 if (MaxID == 0)
9661 return DAG.getConstant(0, SL, MVT::i32);
9662
9663 // It's undefined behavior if a function marked with the amdgpu-no-*
9664 // attributes uses the corresponding intrinsic.
9665 if (!Arg)
9666 return DAG.getPOISON(Op->getValueType(0));
9667
9668 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9669 SDLoc(DAG.getEntryNode()), Arg);
9670
9671 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9672 // masking operations anyway.
9673 //
9674 // TODO: We could assert the top bit is 0 for the source copy.
9675 if (Arg.isMasked())
9676 return Val;
9677
9678 // Preserve the known bits after expansion to a copy.
9679 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9680 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9681 DAG.getValueType(SmallVT));
9682}
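// Illustrative example: if the maximum workitem id in this dimension is 63,
// the value type asserted here is i6, letting later combines assume the upper
// 26 bits of the i32 copy are zero.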
9683
9684SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9685 SelectionDAG &DAG) const {
9686 MachineFunction &MF = DAG.getMachineFunction();
9687 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9688
9689 EVT VT = Op.getValueType();
9690 SDLoc DL(Op);
9691 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9692
9693 // TODO: Should this propagate fast-math-flags?
9694
9695 switch (IntrinsicID) {
9696 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9697 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9698 return emitNonHSAIntrinsicError(DAG, DL, VT);
9699 return getPreloadedValue(DAG, *MFI, VT,
9700 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9701 }
9702 case Intrinsic::amdgcn_dispatch_ptr:
9703 case Intrinsic::amdgcn_queue_ptr: {
9704 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9705 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9706 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9707 DL.getDebugLoc()));
9708 return DAG.getPOISON(VT);
9709 }
9710
9711 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9712 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9713 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9714 return getPreloadedValue(DAG, *MFI, VT, RegID);
9715 }
9716 case Intrinsic::amdgcn_implicitarg_ptr: {
9717 if (MFI->isEntryFunction())
9718 return getImplicitArgPtr(DAG, DL);
9719 return getPreloadedValue(DAG, *MFI, VT,
9720 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9721 }
9722 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9723 if (!MFI->isEntryFunction()) {
9724 // This only makes sense to call in a kernel, so just lower to null.
9725 return DAG.getConstant(0, DL, VT);
9726 }
9727
9728 return getPreloadedValue(DAG, *MFI, VT,
9729 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9730 }
9731 case Intrinsic::amdgcn_dispatch_id: {
9732 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9733 }
9734 case Intrinsic::amdgcn_rcp:
9735 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9736 case Intrinsic::amdgcn_rsq:
9737 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9738 case Intrinsic::amdgcn_rsq_legacy:
9739 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9740 return emitRemovedIntrinsicError(DAG, DL, VT);
9741 return SDValue();
9742 case Intrinsic::amdgcn_rcp_legacy:
9743 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9744 return emitRemovedIntrinsicError(DAG, DL, VT);
9745 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9746 case Intrinsic::amdgcn_rsq_clamp: {
9747 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9748 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9749
9750 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9751 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9752 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9753
9754 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9755 SDValue Tmp =
9756 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9757 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9758 DAG.getConstantFP(Min, DL, VT));
9759 }
9760 case Intrinsic::r600_read_ngroups_x:
9761 if (Subtarget->isAmdHsaOS())
9762 return emitNonHSAIntrinsicError(DAG, DL, VT);
9763
9764 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9766 false);
9767 case Intrinsic::r600_read_ngroups_y:
9768 if (Subtarget->isAmdHsaOS())
9769 return emitNonHSAIntrinsicError(DAG, DL, VT);
9770
9771 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9773 false);
9774 case Intrinsic::r600_read_ngroups_z:
9775 if (Subtarget->isAmdHsaOS())
9776 return emitNonHSAIntrinsicError(DAG, DL, VT);
9777
9778 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9780 false);
9781 case Intrinsic::r600_read_local_size_x:
9782 if (Subtarget->isAmdHsaOS())
9783 return emitNonHSAIntrinsicError(DAG, DL, VT);
9784
9785 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9787 case Intrinsic::r600_read_local_size_y:
9788 if (Subtarget->isAmdHsaOS())
9789 return emitNonHSAIntrinsicError(DAG, DL, VT);
9790
9791 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9793 case Intrinsic::r600_read_local_size_z:
9794 if (Subtarget->isAmdHsaOS())
9795 return emitNonHSAIntrinsicError(DAG, DL, VT);
9796
9797 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9799 case Intrinsic::amdgcn_workgroup_id_x:
9800 return lowerWorkGroupId(DAG, *MFI, VT,
9804 case Intrinsic::amdgcn_workgroup_id_y:
9805 return lowerWorkGroupId(DAG, *MFI, VT,
9809 case Intrinsic::amdgcn_workgroup_id_z:
9810 return lowerWorkGroupId(DAG, *MFI, VT,
9814 case Intrinsic::amdgcn_cluster_id_x:
9815 return Subtarget->hasClusters()
9816 ? getPreloadedValue(DAG, *MFI, VT,
9818 : DAG.getPOISON(VT);
9819 case Intrinsic::amdgcn_cluster_id_y:
9820 return Subtarget->hasClusters()
9821 ? getPreloadedValue(DAG, *MFI, VT,
9823 : DAG.getPOISON(VT);
9824 case Intrinsic::amdgcn_cluster_id_z:
9825 return Subtarget->hasClusters()
9826 ? getPreloadedValue(DAG, *MFI, VT,
9828 : DAG.getPOISON(VT);
9829 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9830 return Subtarget->hasClusters()
9831 ? getPreloadedValue(
9832 DAG, *MFI, VT,
9834 : DAG.getPOISON(VT);
9835 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9836 return Subtarget->hasClusters()
9837 ? getPreloadedValue(
9838 DAG, *MFI, VT,
9840 : DAG.getPOISON(VT);
9841 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9842 return Subtarget->hasClusters()
9843 ? getPreloadedValue(
9844 DAG, *MFI, VT,
9846 : DAG.getPOISON(VT);
9847 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9848 return Subtarget->hasClusters()
9849 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9850 : SDValue();
9851 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9852 return Subtarget->hasClusters()
9853 ? getPreloadedValue(
9854 DAG, *MFI, VT,
9856 : DAG.getPOISON(VT);
9857 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9858 return Subtarget->hasClusters()
9859 ? getPreloadedValue(
9860 DAG, *MFI, VT,
9862 : DAG.getPOISON(VT);
9863 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9864 return Subtarget->hasClusters()
9865 ? getPreloadedValue(
9866 DAG, *MFI, VT,
9868 : DAG.getPOISON(VT);
9869 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9870 return Subtarget->hasClusters()
9871 ? getPreloadedValue(
9872 DAG, *MFI, VT,
9874 : DAG.getPOISON(VT);
9875 case Intrinsic::amdgcn_wave_id:
9876 return lowerWaveID(DAG, Op);
9877 case Intrinsic::amdgcn_lds_kernel_id: {
9878 if (MFI->isEntryFunction())
9879 return getLDSKernelId(DAG, DL);
9880 return getPreloadedValue(DAG, *MFI, VT,
9881 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9882 }
9883 case Intrinsic::amdgcn_workitem_id_x:
9884 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9885 case Intrinsic::amdgcn_workitem_id_y:
9886 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9887 case Intrinsic::amdgcn_workitem_id_z:
9888 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9889 case Intrinsic::amdgcn_wavefrontsize:
9890 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9891 SDLoc(Op), MVT::i32);
9892 case Intrinsic::amdgcn_s_buffer_load: {
9893 unsigned CPol = Op.getConstantOperandVal(3);
9894 // s_buffer_load, because of how it's optimized, can't be volatile
9895 // so reject ones with the volatile bit set.
9896 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9897 ? AMDGPU::CPol::ALL
9898 : AMDGPU::CPol::ALL_pregfx12))
9899 return Op;
9900 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9901 Op.getOperand(3), DAG);
9902 }
9903 case Intrinsic::amdgcn_fdiv_fast:
9904 return lowerFDIV_FAST(Op, DAG);
9905 case Intrinsic::amdgcn_sin:
9906 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9907
9908 case Intrinsic::amdgcn_cos:
9909 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9910
9911 case Intrinsic::amdgcn_mul_u24:
9912 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9913 Op.getOperand(2));
9914 case Intrinsic::amdgcn_mul_i24:
9915 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9916 Op.getOperand(2));
9917
9918 case Intrinsic::amdgcn_log_clamp: {
9919 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9920 return SDValue();
9921
9922 return emitRemovedIntrinsicError(DAG, DL, VT);
9923 }
9924 case Intrinsic::amdgcn_fract:
9925 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9926
9927 case Intrinsic::amdgcn_class:
9928 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9929 Op.getOperand(2));
9930 case Intrinsic::amdgcn_div_fmas:
9931 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9932 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9933
9934 case Intrinsic::amdgcn_div_fixup:
9935 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9936 Op.getOperand(2), Op.getOperand(3));
9937
9938 case Intrinsic::amdgcn_div_scale: {
9939 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9940
9941 // Translate to the operands expected by the machine instruction. The
9942 // first parameter must match the first operand of the instruction.
9943 SDValue Numerator = Op.getOperand(1);
9944 SDValue Denominator = Op.getOperand(2);
9945
9946 // Note this order is opposite of the machine instruction's operations,
9947 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9948 // intrinsic has the numerator as the first operand to match a normal
9949 // division operation.
9950
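// An all-ones selector forwards the numerator as the first DIV_SCALE
// operand; otherwise the denominator is forwarded.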
9951 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9952
9953 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9954 Denominator, Numerator);
9955 }
9956 case Intrinsic::amdgcn_icmp: {
9957 // There is a Pat that handles this variant, so return it as-is.
9958 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9959 Op.getConstantOperandVal(2) == 0 &&
9960 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9961 return Op;
9962 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9963 }
9964 case Intrinsic::amdgcn_fcmp: {
9965 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9966 }
9967 case Intrinsic::amdgcn_ballot:
9968 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9969 case Intrinsic::amdgcn_fmed3:
9970 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9971 Op.getOperand(2), Op.getOperand(3));
9972 case Intrinsic::amdgcn_fdot2:
9973 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9974 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9975 case Intrinsic::amdgcn_fmul_legacy:
9976 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9977 Op.getOperand(2));
9978 case Intrinsic::amdgcn_sffbh:
9979 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9980 case Intrinsic::amdgcn_sbfe:
9981 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9982 Op.getOperand(2), Op.getOperand(3));
9983 case Intrinsic::amdgcn_ubfe:
9984 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9985 Op.getOperand(2), Op.getOperand(3));
9986 case Intrinsic::amdgcn_cvt_pkrtz:
9987 case Intrinsic::amdgcn_cvt_pknorm_i16:
9988 case Intrinsic::amdgcn_cvt_pknorm_u16:
9989 case Intrinsic::amdgcn_cvt_pk_i16:
9990 case Intrinsic::amdgcn_cvt_pk_u16: {
9991 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9992 EVT VT = Op.getValueType();
9993 unsigned Opcode;
9994
9995 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9996 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9997 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9998 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9999 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10000 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10001 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10002 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10003 else
10004 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10005
10006 if (isTypeLegal(VT))
10007 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
10008
10009 SDValue Node =
10010 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10011 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10012 }
10013 case Intrinsic::amdgcn_fmad_ftz:
10014 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10015 Op.getOperand(2), Op.getOperand(3));
10016
10017 case Intrinsic::amdgcn_if_break:
10018 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10019 Op->getOperand(1), Op->getOperand(2)),
10020 0);
10021
10022 case Intrinsic::amdgcn_groupstaticsize: {
10023 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10024 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10025 return Op;
10026
10027 const Module *M = MF.getFunction().getParent();
10028 const GlobalValue *GV =
10029 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10030 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10031 SIInstrInfo::MO_ABS32_LO);
10032 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10033 }
10034 case Intrinsic::amdgcn_is_shared:
10035 case Intrinsic::amdgcn_is_private: {
10036 SDLoc SL(Op);
10037 SDValue SrcVec =
10038 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10039 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10040 DAG.getConstant(1, SL, MVT::i32));
10041
10042 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10043 ? AMDGPUAS::LOCAL_ADDRESS
10044 : AMDGPUAS::PRIVATE_ADDRESS;
10045 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10046 Subtarget->hasGloballyAddressableScratch()) {
10047 SDValue FlatScratchBaseHi(
10048 DAG.getMachineNode(
10049 AMDGPU::S_MOV_B32, DL, MVT::i32,
10050 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10051 0);
10052 // Test bits 63..58 against the aperture address.
10053 return DAG.getSetCC(
10054 SL, MVT::i1,
10055 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10056 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10057 }
10058
10059 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10060 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10061 }
10062 case Intrinsic::amdgcn_perm:
10063 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10064 Op.getOperand(2), Op.getOperand(3));
10065 case Intrinsic::amdgcn_reloc_constant: {
10066 Module *M = MF.getFunction().getParent();
10067 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10068 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10069 auto *RelocSymbol = cast<GlobalVariable>(
10070 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10071 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10072 SIInstrInfo::MO_ABS32_LO);
10073 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10074 }
10075 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10076 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10077 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10078 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10079 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10080 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10081 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10082 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
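// If the index key already has the type the selection patterns expect,
// keep the node as-is; otherwise rebuild it with the key any-extended or
// truncated to that type.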
10083 if (Op.getOperand(4).getValueType() == MVT::i32)
10084 return SDValue();
10085
10086 SDLoc SL(Op);
10087 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10088 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10089 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10090 Op.getOperand(3), IndexKeyi32);
10091 }
10092 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10093 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10094 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10095 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10096 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10097 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10098 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10099 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10100 if (Op.getOperand(4).getValueType() == MVT::i64)
10101 return SDValue();
10102
10103 SDLoc SL(Op);
10104 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10105 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10106 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10107 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10108 Op.getOperand(6)});
10109 }
10110 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10111 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10112 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10113 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10114 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10115 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10116 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10117 ? MVT::i64
10118 : MVT::i32;
10119 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10120 return SDValue();
10121
10122 SDLoc SL(Op);
10123 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10124 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10125 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10126 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10127 IndexKey, Op.getOperand(7),
10128 Op.getOperand(8)}); // No clamp operand
10129 }
10130 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10131 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10132 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10133 if (Op.getOperand(6).getValueType() == MVT::i32)
10134 return SDValue();
10135
10136 SDLoc SL(Op);
10137 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10138 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10139 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10140 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10141 IndexKeyi32, Op.getOperand(7)});
10142 }
10143 case Intrinsic::amdgcn_addrspacecast_nonnull:
10144 return lowerADDRSPACECAST(Op, DAG);
10145 case Intrinsic::amdgcn_readlane:
10146 case Intrinsic::amdgcn_readfirstlane:
10147 case Intrinsic::amdgcn_writelane:
10148 case Intrinsic::amdgcn_permlane16:
10149 case Intrinsic::amdgcn_permlanex16:
10150 case Intrinsic::amdgcn_permlane64:
10151 case Intrinsic::amdgcn_set_inactive:
10152 case Intrinsic::amdgcn_set_inactive_chain_arg:
10153 case Intrinsic::amdgcn_mov_dpp8:
10154 case Intrinsic::amdgcn_update_dpp:
10155 return lowerLaneOp(*this, Op.getNode(), DAG);
10156 case Intrinsic::amdgcn_dead: {
10158 for (const EVT ValTy : Op.getNode()->values())
10159 Poisons.push_back(DAG.getPOISON(ValTy));
10160 return DAG.getMergeValues(Poisons, SDLoc(Op));
10161 }
10162 default:
10163 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10164 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10165 return lowerImage(Op, ImageDimIntr, DAG, false);
10166
10167 return Op;
10168 }
10169}
10170
10171// On targets not supporting constant in soffset field, turn zero to
10172// SGPR_NULL to avoid generating an extra s_mov with zero.
10173 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10174 const GCNSubtarget *Subtarget) {
10175 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10176 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10177 return SOffset;
10178}
10179
10180SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10181 SelectionDAG &DAG,
10182 unsigned NewOpcode) const {
10183 SDLoc DL(Op);
10184
10185 SDValue VData = Op.getOperand(2);
10186 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10187 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10188 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10189 SDValue Ops[] = {
10190 Op.getOperand(0), // Chain
10191 VData, // vdata
10192 Rsrc, // rsrc
10193 DAG.getConstant(0, DL, MVT::i32), // vindex
10194 VOffset, // voffset
10195 SOffset, // soffset
10196 Offset, // offset
10197 Op.getOperand(6), // cachepolicy
10198 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10199 };
10200
10201 auto *M = cast<MemSDNode>(Op);
10202
10203 EVT MemVT = VData.getValueType();
10204 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10205 M->getMemOperand());
10206}
10207
10208SDValue
10209SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10210 unsigned NewOpcode) const {
10211 SDLoc DL(Op);
10212
10213 SDValue VData = Op.getOperand(2);
10214 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10215 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10216 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10217 SDValue Ops[] = {
10218 Op.getOperand(0), // Chain
10219 VData, // vdata
10220 Rsrc, // rsrc
10221 Op.getOperand(4), // vindex
10222 VOffset, // voffset
10223 SOffset, // soffset
10224 Offset, // offset
10225 Op.getOperand(7), // cachepolicy
10226 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10227 };
10228
10229 auto *M = cast<MemSDNode>(Op);
10230
10231 EVT MemVT = VData.getValueType();
10232 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10233 M->getMemOperand());
10234}
10235
10236SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10237 SelectionDAG &DAG) const {
10238 unsigned IntrID = Op.getConstantOperandVal(1);
10239 SDLoc DL(Op);
10240
10241 switch (IntrID) {
10242 case Intrinsic::amdgcn_ds_ordered_add:
10243 case Intrinsic::amdgcn_ds_ordered_swap: {
10244 MemSDNode *M = cast<MemSDNode>(Op);
10245 SDValue Chain = M->getOperand(0);
10246 SDValue M0 = M->getOperand(2);
10247 SDValue Value = M->getOperand(3);
10248 unsigned IndexOperand = M->getConstantOperandVal(7);
10249 unsigned WaveRelease = M->getConstantOperandVal(8);
10250 unsigned WaveDone = M->getConstantOperandVal(9);
10251
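// The packed index operand encodes the ordered-count index in bits [5:0]
// and, on GFX10+, the dword count in bits [27:24]; any other set bits are
// diagnosed as a bad index operand below.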
10252 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10253 IndexOperand &= ~0x3f;
10254 unsigned CountDw = 0;
10255
10256 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10257 CountDw = (IndexOperand >> 24) & 0xf;
10258 IndexOperand &= ~(0xf << 24);
10259
10260 if (CountDw < 1 || CountDw > 4) {
10261 const Function &Fn = DAG.getMachineFunction().getFunction();
10262 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10263 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10264 DL.getDebugLoc()));
10265 CountDw = 1;
10266 }
10267 }
10268
10269 if (IndexOperand) {
10270 const Function &Fn = DAG.getMachineFunction().getFunction();
10271 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10272 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10273 }
10274
10275 if (WaveDone && !WaveRelease) {
10276 // TODO: Move this to IR verifier
10277 const Function &Fn = DAG.getMachineFunction().getFunction();
10278 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10279 Fn, "ds_ordered_count: wave_done requires wave_release",
10280 DL.getDebugLoc()));
10281 }
10282
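// Pack the DS offset immediate: offset0 (bits [7:0]) is the ordered-count
// index scaled by 4; offset1 (bits [15:8]) carries wave_release, wave_done,
// the shader type on pre-GFX11 targets, the instruction kind, and, on
// GFX10+, the dword count minus one.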
10283 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10284 unsigned ShaderType =
10285 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10286 unsigned Offset0 = OrderedCountIndex << 2;
10287 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10288
10289 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10290 Offset1 |= (CountDw - 1) << 6;
10291
10292 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10293 Offset1 |= ShaderType << 2;
10294
10295 unsigned Offset = Offset0 | (Offset1 << 8);
10296
10297 SDValue Ops[] = {
10298 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10299 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10300 };
10301 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10302 M->getVTList(), Ops, M->getMemoryVT(),
10303 M->getMemOperand());
10304 }
10305 case Intrinsic::amdgcn_raw_buffer_load:
10306 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10307 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10308 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10309 case Intrinsic::amdgcn_raw_buffer_load_format:
10310 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10311 const bool IsFormat =
10312 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10313 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10314
10315 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10316 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10317 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10318 SDValue Ops[] = {
10319 Op.getOperand(0), // Chain
10320 Rsrc, // rsrc
10321 DAG.getConstant(0, DL, MVT::i32), // vindex
10322 VOffset, // voffset
10323 SOffset, // soffset
10324 Offset, // offset
10325 Op.getOperand(5), // cachepolicy, swizzled buffer
10326 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10327 };
10328
10329 auto *M = cast<MemSDNode>(Op);
10330 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10331 }
10332 case Intrinsic::amdgcn_struct_buffer_load:
10333 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10334 case Intrinsic::amdgcn_struct_buffer_load_format:
10335 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10336 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10337 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10338 const bool IsFormat =
10339 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10340 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10341
10342 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10343 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10344 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10345 SDValue Ops[] = {
10346 Op.getOperand(0), // Chain
10347 Rsrc, // rsrc
10348 Op.getOperand(3), // vindex
10349 VOffset, // voffset
10350 SOffset, // soffset
10351 Offset, // offset
10352 Op.getOperand(6), // cachepolicy, swizzled buffer
10353 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10354 };
10355
10356 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10357 }
10358 case Intrinsic::amdgcn_raw_tbuffer_load:
10359 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10360 MemSDNode *M = cast<MemSDNode>(Op);
10361 EVT LoadVT = Op.getValueType();
10362 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10363 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10364 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10365
10366 SDValue Ops[] = {
10367 Op.getOperand(0), // Chain
10368 Rsrc, // rsrc
10369 DAG.getConstant(0, DL, MVT::i32), // vindex
10370 VOffset, // voffset
10371 SOffset, // soffset
10372 Offset, // offset
10373 Op.getOperand(5), // format
10374 Op.getOperand(6), // cachepolicy, swizzled buffer
10375 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10376 };
10377
10378 if (LoadVT.getScalarType() == MVT::f16)
10379 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10380 Ops);
10381 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10382 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10383 DAG);
10384 }
10385 case Intrinsic::amdgcn_struct_tbuffer_load:
10386 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10387 MemSDNode *M = cast<MemSDNode>(Op);
10388 EVT LoadVT = Op.getValueType();
10389 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10390 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10391 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10392
10393 SDValue Ops[] = {
10394 Op.getOperand(0), // Chain
10395 Rsrc, // rsrc
10396 Op.getOperand(3), // vindex
10397 VOffset, // voffset
10398 SOffset, // soffset
10399 Offset, // offset
10400 Op.getOperand(6), // format
10401 Op.getOperand(7), // cachepolicy, swizzled buffer
10402 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10403 };
10404
10405 if (LoadVT.getScalarType() == MVT::f16)
10406 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10407 Ops);
10408 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10409 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10410 DAG);
10411 }
10412 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10413 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10414 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10415 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10416 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10417 return lowerStructBufferAtomicIntrin(Op, DAG,
10418 AMDGPUISD::BUFFER_ATOMIC_FADD);
10419 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10420 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10421 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10422 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10423 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10424 return lowerStructBufferAtomicIntrin(Op, DAG,
10425 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10426 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10427 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10428 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10429 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10430 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10431 return lowerStructBufferAtomicIntrin(Op, DAG,
10432 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10433 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10434 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10435 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10436 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10437 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10438 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10439 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10440 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10441 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10442 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10444 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10445 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10446 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10447 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10448 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10449 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10450 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10451 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10453 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10454 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10455 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10456 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10457 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10458 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10459 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10460 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10461 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10462 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10463 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10464 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10465 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10466 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10467 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10468 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10469 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10470 return lowerRawBufferAtomicIntrin(Op, DAG,
10471 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10472 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10473 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10474 return lowerStructBufferAtomicIntrin(Op, DAG,
10475 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10476 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10477 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10478 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10479 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10480 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10481 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10482 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10483 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10484 return lowerStructBufferAtomicIntrin(Op, DAG,
10485 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10486 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10487 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10488 return lowerStructBufferAtomicIntrin(Op, DAG,
10489 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10490 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10491 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10492 return lowerStructBufferAtomicIntrin(Op, DAG,
10493 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10494 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10495 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10496 return lowerStructBufferAtomicIntrin(Op, DAG,
10497 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10498 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10500 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10501 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10503 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10504 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10505 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10506 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10507 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10508 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10509 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10510 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10511 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10512 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10513 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10514 return lowerStructBufferAtomicIntrin(Op, DAG,
10515 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10516
10517 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10518 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10519 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10520 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10521 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10522 SDValue Ops[] = {
10523 Op.getOperand(0), // Chain
10524 Op.getOperand(2), // src
10525 Op.getOperand(3), // cmp
10526 Rsrc, // rsrc
10527 DAG.getConstant(0, DL, MVT::i32), // vindex
10528 VOffset, // voffset
10529 SOffset, // soffset
10530 Offset, // offset
10531 Op.getOperand(7), // cachepolicy
10532 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10533 };
10534 EVT VT = Op.getValueType();
10535 auto *M = cast<MemSDNode>(Op);
10536
10537 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10538 Op->getVTList(), Ops, VT,
10539 M->getMemOperand());
10540 }
10541 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10542 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10543 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10544 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10545 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10546 SDValue Ops[] = {
10547 Op.getOperand(0), // Chain
10548 Op.getOperand(2), // src
10549 Op.getOperand(3), // cmp
10550 Rsrc, // rsrc
10551 Op.getOperand(5), // vindex
10552 VOffset, // voffset
10553 SOffset, // soffset
10554 Offset, // offset
10555 Op.getOperand(8), // cachepolicy
10556 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10557 };
10558 EVT VT = Op.getValueType();
10559 auto *M = cast<MemSDNode>(Op);
10560
10561 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10562 Op->getVTList(), Ops, VT,
10563 M->getMemOperand());
10564 }
10565 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10566 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10567 MemSDNode *M = cast<MemSDNode>(Op);
10568 SDValue NodePtr = M->getOperand(2);
10569 SDValue RayExtent = M->getOperand(3);
10570 SDValue InstanceMask = M->getOperand(4);
10571 SDValue RayOrigin = M->getOperand(5);
10572 SDValue RayDir = M->getOperand(6);
10573 SDValue Offsets = M->getOperand(7);
10574 SDValue TDescr = M->getOperand(8);
10575
10576 assert(NodePtr.getValueType() == MVT::i64);
10577 assert(RayDir.getValueType() == MVT::v3f32);
10578
10579 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10580 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10581 return SDValue();
10582 }
10583
10584 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10585 const unsigned NumVDataDwords = 10;
10586 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10587 int Opcode = AMDGPU::getMIMGOpcode(
10588 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10589 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10590 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10591 assert(Opcode != -1);
10592
10594 Ops.push_back(NodePtr);
10595 Ops.push_back(DAG.getBuildVector(
10596 MVT::v2i32, DL,
10597 {DAG.getBitcast(MVT::i32, RayExtent),
10598 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10599 Ops.push_back(RayOrigin);
10600 Ops.push_back(RayDir);
10601 Ops.push_back(Offsets);
10602 Ops.push_back(TDescr);
10603 Ops.push_back(M->getChain());
10604
10605 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10606 MachineMemOperand *MemRef = M->getMemOperand();
10607 DAG.setNodeMemRefs(NewNode, {MemRef});
10608 return SDValue(NewNode, 0);
10609 }
10610 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10611 MemSDNode *M = cast<MemSDNode>(Op);
10612 SDValue NodePtr = M->getOperand(2);
10613 SDValue RayExtent = M->getOperand(3);
10614 SDValue RayOrigin = M->getOperand(4);
10615 SDValue RayDir = M->getOperand(5);
10616 SDValue RayInvDir = M->getOperand(6);
10617 SDValue TDescr = M->getOperand(7);
10618
10619 assert(NodePtr.getValueType() == MVT::i32 ||
10620 NodePtr.getValueType() == MVT::i64);
10621 assert(RayDir.getValueType() == MVT::v3f16 ||
10622 RayDir.getValueType() == MVT::v3f32);
10623
10624 if (!Subtarget->hasGFX10_AEncoding()) {
10625 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10626 return SDValue();
10627 }
10628
10629 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10630 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10631 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10632 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10633 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10634 const unsigned NumVDataDwords = 4;
10635 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10636 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10637 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10638 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10639 IsGFX12Plus;
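// NSA encodings allow each VADDR operand to be passed in its own register;
// without NSA the address operands are merged into a single contiguous
// vector register further below.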
10640 const unsigned BaseOpcodes[2][2] = {
10641 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10642 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10643 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10644 int Opcode;
10645 if (UseNSA) {
10646 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10647 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10648 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10649 : AMDGPU::MIMGEncGfx10NSA,
10650 NumVDataDwords, NumVAddrDwords);
10651 } else {
10652 assert(!IsGFX12Plus);
10653 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10654 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10655 : AMDGPU::MIMGEncGfx10Default,
10656 NumVDataDwords, NumVAddrDwords);
10657 }
10658 assert(Opcode != -1);
10659
10661
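// packLanes flattens a 3-element vector operand into i32 dwords: 32-bit
// lanes are bitcast directly, while f16 lanes are paired into v2f16 dwords,
// borrowing the previously pushed dword's upper half when the triple does
// not start on a dword boundary.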
10662 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10664 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10665 if (Lanes[0].getValueSizeInBits() == 32) {
10666 for (unsigned I = 0; I < 3; ++I)
10667 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10668 } else {
10669 if (IsAligned) {
10670 Ops.push_back(DAG.getBitcast(
10671 MVT::i32,
10672 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10673 Ops.push_back(Lanes[2]);
10674 } else {
10675 SDValue Elt0 = Ops.pop_back_val();
10676 Ops.push_back(DAG.getBitcast(
10677 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10678 Ops.push_back(DAG.getBitcast(
10679 MVT::i32,
10680 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10681 }
10682 }
10683 };
10684
10685 if (UseNSA && IsGFX11Plus) {
10686 Ops.push_back(NodePtr);
10687 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10688 Ops.push_back(RayOrigin);
10689 if (IsA16) {
10690 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10691 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10692 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10693 for (unsigned I = 0; I < 3; ++I) {
10694 MergedLanes.push_back(DAG.getBitcast(
10695 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10696 {DirLanes[I], InvDirLanes[I]})));
10697 }
10698 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10699 } else {
10700 Ops.push_back(RayDir);
10701 Ops.push_back(RayInvDir);
10702 }
10703 } else {
10704 if (Is64)
10705 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10706 2);
10707 else
10708 Ops.push_back(NodePtr);
10709
10710 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10711 packLanes(RayOrigin, true);
10712 packLanes(RayDir, true);
10713 packLanes(RayInvDir, false);
10714 }
10715
10716 if (!UseNSA) {
10717 // Build a single vector containing all the operands so far prepared.
10718 if (NumVAddrDwords > 12) {
10719 SDValue Undef = DAG.getPOISON(MVT::i32);
10720 Ops.append(16 - Ops.size(), Undef);
10721 }
10722 assert(Ops.size() >= 8 && Ops.size() <= 12);
10723 SDValue MergedOps =
10724 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10725 Ops.clear();
10726 Ops.push_back(MergedOps);
10727 }
10728
10729 Ops.push_back(TDescr);
10730 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10731 Ops.push_back(M->getChain());
10732
10733 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10734 MachineMemOperand *MemRef = M->getMemOperand();
10735 DAG.setNodeMemRefs(NewNode, {MemRef});
10736 return SDValue(NewNode, 0);
10737 }
10738 case Intrinsic::amdgcn_global_atomic_fmin_num:
10739 case Intrinsic::amdgcn_global_atomic_fmax_num:
10740 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10741 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10742 MemSDNode *M = cast<MemSDNode>(Op);
10743 SDValue Ops[] = {
10744 M->getOperand(0), // Chain
10745 M->getOperand(2), // Ptr
10746 M->getOperand(3) // Value
10747 };
10748 unsigned Opcode = 0;
10749 switch (IntrID) {
10750 case Intrinsic::amdgcn_global_atomic_fmin_num:
10751 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10752 Opcode = ISD::ATOMIC_LOAD_FMIN;
10753 break;
10754 }
10755 case Intrinsic::amdgcn_global_atomic_fmax_num:
10756 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10757 Opcode = ISD::ATOMIC_LOAD_FMAX;
10758 break;
10759 }
10760 default:
10761 llvm_unreachable("unhandled atomic opcode");
10762 }
10763 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10764 Ops, M->getMemOperand());
10765 }
10766 case Intrinsic::amdgcn_s_get_barrier_state:
10767 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10768 SDValue Chain = Op->getOperand(0);
10770 unsigned Opc;
10771
10772 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10773 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
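// Named barriers encode the barrier ID in bits [9:4] of the operand; the
// plain barrier-state intrinsic takes the ID directly.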
10774 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10775 BarID = (BarID >> 4) & 0x3F;
10776 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10777 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10778 Ops.push_back(K);
10779 Ops.push_back(Chain);
10780 } else {
10781 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10782 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10783 SDValue M0Val;
10784 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10785 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10786 M0Val = SDValue(
10787 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10788 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10789 0);
10790 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10791 } else
10792 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10793 }
10794
10795 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10796 return SDValue(NewMI, 0);
10797 }
10798 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10799 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10800 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10801 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10802 SDValue Chain = Op->getOperand(0);
10803 SDValue Ptr = Op->getOperand(2);
10804 EVT VT = Op->getValueType(0);
10805 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10806 Chain, Ptr, MII->getMemOperand());
10807 }
10808 default:
10809
10810 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10811 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10812 return lowerImage(Op, ImageDimIntr, DAG, true);
10813
10814 return SDValue();
10815 }
10816}
10817
10818// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10819// dwordx4 if on SI and handle TFE loads.
10820SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10821 SDVTList VTList,
10822 ArrayRef<SDValue> Ops, EVT MemVT,
10823 MachineMemOperand *MMO,
10824 SelectionDAG &DAG) const {
10825 LLVMContext &C = *DAG.getContext();
10826 MachineFunction &MF = DAG.getMachineFunction();
10827 EVT VT = VTList.VTs[0];
10828
10829 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10830 bool IsTFE = VTList.NumVTs == 3;
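// TFE returns an extra status dword, so query the memory intrinsic with one
// more i32 than the value requires and then split the result back into
// {value, status, chain}.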
10831 if (IsTFE) {
10832 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10833 unsigned NumOpDWords = NumValueDWords + 1;
10834 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10835 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10836 MachineMemOperand *OpDWordsMMO =
10837 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10838 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10839 OpDWordsVT, OpDWordsMMO, DAG);
10840 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10841 DAG.getVectorIdxConstant(NumValueDWords, DL));
10842 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10843 SDValue ValueDWords =
10844 NumValueDWords == 1
10845 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10846 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10847 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10848 ZeroIdx);
10849 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10850 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10851 }
10852
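// Targets without dwordx3 load/store support widen 3-element results to a
// 4-element access and extract the original subvector afterwards.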
10853 if (!Subtarget->hasDwordx3LoadStores() &&
10854 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10855 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10856 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10857 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10858 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10859 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10860 WidenedMemVT, WidenedMMO);
10861 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10862 DAG.getVectorIdxConstant(0, DL));
10863 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10864 }
10865
10866 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10867}
10868
10869SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10870 bool ImageStore) const {
10871 EVT StoreVT = VData.getValueType();
10872
10873 // No change for f16 and legal vector D16 types.
10874 if (!StoreVT.isVector())
10875 return VData;
10876
10877 SDLoc DL(VData);
10878 unsigned NumElements = StoreVT.getVectorNumElements();
10879
10880 if (Subtarget->hasUnpackedD16VMem()) {
10881 // We need to unpack the packed data to store.
10882 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10883 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10884
10885 EVT EquivStoreVT =
10886 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10887 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10888 return DAG.UnrollVectorOp(ZExt.getNode());
10889 }
10890
10891 // The sq block of gfx8.1 does not estimate register use correctly for d16
10892 // image store instructions. The data operand is computed as if it were not a
10893 // d16 image instruction.
10894 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10895 // Bitcast to i16
10896 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10897 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10898
10899 // Decompose into scalars
10901 DAG.ExtractVectorElements(IntVData, Elts);
10902
10903 // Group pairs of i16 into v2i16 and bitcast to i32
10904 SmallVector<SDValue, 4> PackedElts;
10905 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10906 SDValue Pair =
10907 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10908 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10909 PackedElts.push_back(IntPair);
10910 }
10911 if ((NumElements % 2) == 1) {
10912 // Handle v3i16
10913 unsigned I = Elts.size() / 2;
10914 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10915 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10916 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10917 PackedElts.push_back(IntPair);
10918 }
10919
10920 // Pad with poison values.
10921 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10922
10923 // Build final vector
10924 EVT VecVT =
10925 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10926 return DAG.getBuildVector(VecVT, DL, PackedElts);
10927 }
10928
10929 if (NumElements == 3) {
10930 EVT IntStoreVT =
10931 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10932 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10933
10934 EVT WidenedStoreVT = EVT::getVectorVT(
10935 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10936 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10937 WidenedStoreVT.getStoreSizeInBits());
10938 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10939 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10940 }
10941
10942 assert(isTypeLegal(StoreVT));
10943 return VData;
10944}
10945
10946SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10947 SelectionDAG &DAG) const {
10948 SDLoc DL(Op);
10949 SDValue Chain = Op.getOperand(0);
10950 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10951 MachineFunction &MF = DAG.getMachineFunction();
10952
10953 switch (IntrinsicID) {
10954 case Intrinsic::amdgcn_exp_compr: {
10955 if (!Subtarget->hasCompressedExport()) {
10956 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10958 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10959 }
10960 SDValue Src0 = Op.getOperand(4);
10961 SDValue Src1 = Op.getOperand(5);
10962 // Hack around illegal type on SI by directly selecting it.
10963 if (isTypeLegal(Src0.getValueType()))
10964 return SDValue();
10965
10966 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10967 SDValue Undef = DAG.getPOISON(MVT::f32);
10968 const SDValue Ops[] = {
10969 Op.getOperand(2), // tgt
10970 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10971 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10972 Undef, // src2
10973 Undef, // src3
10974 Op.getOperand(7), // vm
10975 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10976 Op.getOperand(3), // en
10977 Op.getOperand(0) // Chain
10978 };
10979
10980 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10981 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10982 }
10983
10984 case Intrinsic::amdgcn_struct_tbuffer_store:
10985 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10986 SDValue VData = Op.getOperand(2);
10987 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10988 if (IsD16)
10989 VData = handleD16VData(VData, DAG);
10990 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10991 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10992 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10993 SDValue Ops[] = {
10994 Chain,
10995 VData, // vdata
10996 Rsrc, // rsrc
10997 Op.getOperand(4), // vindex
10998 VOffset, // voffset
10999 SOffset, // soffset
11000 Offset, // offset
11001 Op.getOperand(7), // format
11002 Op.getOperand(8), // cachepolicy, swizzled buffer
11003 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11004 };
11005 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11006 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11007 MemSDNode *M = cast<MemSDNode>(Op);
11008 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11009 M->getMemoryVT(), M->getMemOperand());
11010 }
11011
11012 case Intrinsic::amdgcn_raw_tbuffer_store:
11013 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11014 SDValue VData = Op.getOperand(2);
11015 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11016 if (IsD16)
11017 VData = handleD16VData(VData, DAG);
11018 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11019 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11020 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11021 SDValue Ops[] = {
11022 Chain,
11023 VData, // vdata
11024 Rsrc, // rsrc
11025 DAG.getConstant(0, DL, MVT::i32), // vindex
11026 VOffset, // voffset
11027 SOffset, // soffset
11028 Offset, // offset
11029 Op.getOperand(6), // format
11030 Op.getOperand(7), // cachepolicy, swizzled buffer
11031 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11032 };
11033 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11034 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11035 MemSDNode *M = cast<MemSDNode>(Op);
11036 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11037 M->getMemoryVT(), M->getMemOperand());
11038 }
11039
11040 case Intrinsic::amdgcn_raw_buffer_store:
11041 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11042 case Intrinsic::amdgcn_raw_buffer_store_format:
11043 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11044 const bool IsFormat =
11045 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11046 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11047
11048 SDValue VData = Op.getOperand(2);
11049 EVT VDataVT = VData.getValueType();
11050 EVT EltType = VDataVT.getScalarType();
11051 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11052 if (IsD16) {
11053 VData = handleD16VData(VData, DAG);
11054 VDataVT = VData.getValueType();
11055 }
11056
11057 if (!isTypeLegal(VDataVT)) {
11058 VData =
11059 DAG.getNode(ISD::BITCAST, DL,
11060 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11061 }
11062
11063 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11064 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11065 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11066 SDValue Ops[] = {
11067 Chain,
11068 VData,
11069 Rsrc,
11070 DAG.getConstant(0, DL, MVT::i32), // vindex
11071 VOffset, // voffset
11072 SOffset, // soffset
11073 Offset, // offset
11074 Op.getOperand(6), // cachepolicy, swizzled buffer
11075 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11076 };
11077 unsigned Opc =
11078 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11079 : AMDGPUISD::BUFFER_STORE;
11080 MemSDNode *M = cast<MemSDNode>(Op);
11081
11082 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11083 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11084 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11085
11086 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11087 M->getMemoryVT(), M->getMemOperand());
11088 }
11089
11090 case Intrinsic::amdgcn_struct_buffer_store:
11091 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11092 case Intrinsic::amdgcn_struct_buffer_store_format:
11093 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11094 const bool IsFormat =
11095 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11096 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11097
11098 SDValue VData = Op.getOperand(2);
11099 EVT VDataVT = VData.getValueType();
11100 EVT EltType = VDataVT.getScalarType();
11101 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11102
11103 if (IsD16) {
11104 VData = handleD16VData(VData, DAG);
11105 VDataVT = VData.getValueType();
11106 }
11107
11108 if (!isTypeLegal(VDataVT)) {
11109 VData =
11110 DAG.getNode(ISD::BITCAST, DL,
11111 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11112 }
11113
11114 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11115 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11116 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11117 SDValue Ops[] = {
11118 Chain,
11119 VData,
11120 Rsrc,
11121 Op.getOperand(4), // vindex
11122 VOffset, // voffset
11123 SOffset, // soffset
11124 Offset, // offset
11125 Op.getOperand(7), // cachepolicy, swizzled buffer
11126 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11127 };
11128 unsigned Opc =
11129 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11130 : AMDGPUISD::BUFFER_STORE;
11131 MemSDNode *M = cast<MemSDNode>(Op);
11132
11133 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11134 EVT VDataType = VData.getValueType().getScalarType();
11135 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11136 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11137
11138 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11139 M->getMemoryVT(), M->getMemOperand());
11140 }
11141 case Intrinsic::amdgcn_raw_buffer_load_lds:
11142 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11143 case Intrinsic::amdgcn_struct_buffer_load_lds:
11144 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11145 if (!Subtarget->hasVMemToLDSLoad())
11146 return SDValue();
11147 unsigned Opc;
11148 bool HasVIndex =
11149 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11150 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11151 unsigned OpOffset = HasVIndex ? 1 : 0;
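// Struct variants carry an extra vindex operand, so the remaining operand
// indices are shifted by one relative to the raw variants.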
11152 SDValue VOffset = Op.getOperand(5 + OpOffset);
11153 bool HasVOffset = !isNullConstant(VOffset);
11154 unsigned Size = Op->getConstantOperandVal(4);
11155
11156 switch (Size) {
11157 default:
11158 return SDValue();
11159 case 1:
11160 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11161 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11162 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11163 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11164 break;
11165 case 2:
11166 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11167 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11168 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11169 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11170 break;
11171 case 4:
11172 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11173 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11174 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11175 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11176 break;
11177 case 12:
11178 if (!Subtarget->hasLDSLoadB96_B128())
11179 return SDValue();
11180 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11181 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11182 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11183 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11184 break;
11185 case 16:
11186 if (!Subtarget->hasLDSLoadB96_B128())
11187 return SDValue();
11188 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11189 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11190 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11191 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11192 break;
11193 }
11194
11195 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11196
11198
11199 if (HasVIndex && HasVOffset)
11200 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11201 {Op.getOperand(5), // VIndex
11202 VOffset}));
11203 else if (HasVIndex)
11204 Ops.push_back(Op.getOperand(5));
11205 else if (HasVOffset)
11206 Ops.push_back(VOffset);
11207
11208 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11209 Ops.push_back(Rsrc);
11210 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11211 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11212 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11213 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
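// Split the aux operand into cache-policy bits and the swizzle bit; the set
// of valid bits differs between GFX12+ and earlier generations.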
11214 Ops.push_back(DAG.getTargetConstant(
11215 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11216 DL, MVT::i8)); // cpol
11217 Ops.push_back(DAG.getTargetConstant(
11218 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11219 ? 1
11220 : 0,
11221 DL, MVT::i8)); // swz
11222 Ops.push_back(M0Val.getValue(0)); // Chain
11223 Ops.push_back(M0Val.getValue(1)); // Glue
11224
11225 auto *M = cast<MemSDNode>(Op);
11226 MachineMemOperand *LoadMMO = M->getMemOperand();
11227 // Don't set the offset value here because the pointer points to the base of
11228 // the buffer.
11229 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11230
11231 MachinePointerInfo StorePtrI = LoadPtrI;
11232 LoadPtrI.V = PoisonValue::get(
11236
11237 auto F = LoadMMO->getFlags() &
11239 LoadMMO =
11241 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11242
11243 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11244 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11245 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11246
11247 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11248 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11249
11250 return SDValue(Load, 0);
11251 }
11252 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11253 // for "trust me" that the remaining cases are global pointers until
11254 // such time as we can put two mem operands on an intrinsic.
11255 case Intrinsic::amdgcn_load_to_lds:
11256 case Intrinsic::amdgcn_global_load_lds: {
11257 if (!Subtarget->hasVMemToLDSLoad())
11258 return SDValue();
11259
11260 unsigned Opc;
11261 unsigned Size = Op->getConstantOperandVal(4);
11262 switch (Size) {
11263 default:
11264 return SDValue();
11265 case 1:
11266 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11267 break;
11268 case 2:
11269 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11270 break;
11271 case 4:
11272 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11273 break;
11274 case 12:
11275 if (!Subtarget->hasLDSLoadB96_B128())
11276 return SDValue();
11277 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11278 break;
11279 case 16:
11280 if (!Subtarget->hasLDSLoadB96_B128())
11281 return SDValue();
11282 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11283 break;
11284 }
11285
11286 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11287
11289
11290 SDValue Addr = Op.getOperand(2); // Global ptr
11291 SDValue VOffset;
11292 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11293 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11294 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11295 SDValue LHS = Addr.getOperand(0);
11296 SDValue RHS = Addr.getOperand(1);
11297
11298 if (LHS->isDivergent())
11299 std::swap(LHS, RHS);
11300
11301 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11302 RHS.getOperand(0).getValueType() == MVT::i32) {
11303 // add (i64 sgpr), (zero_extend (i32 vgpr))
11304 Addr = LHS;
11305 VOffset = RHS.getOperand(0);
11306 }
11307 }
11308
11309 Ops.push_back(Addr);
11310 if (!Addr->isDivergent()) {
11311 Opc = AMDGPU::getGlobalSaddrOp(Opc);
11312 if (!VOffset)
11313 VOffset =
11314 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11315 DAG.getTargetConstant(0, DL, MVT::i32)),
11316 0);
11317 Ops.push_back(VOffset);
11318 }
11319
11320 Ops.push_back(Op.getOperand(5)); // Offset
11321 Ops.push_back(Op.getOperand(6)); // CPol
11322 Ops.push_back(M0Val.getValue(0)); // Chain
11323 Ops.push_back(M0Val.getValue(1)); // Glue
11324
11325 auto *M = cast<MemSDNode>(Op);
11326 MachineMemOperand *LoadMMO = M->getMemOperand();
11327 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11328 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11329 MachinePointerInfo StorePtrI = LoadPtrI;
11330 LoadPtrI.V = PoisonValue::get(
11331 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
11332 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
11333 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
11334 auto F = LoadMMO->getFlags() &
11335 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11336 LoadMMO =
11337 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11338 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11339 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11340 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11341 LoadMMO->getAAInfo());
11342
11343 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11344 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11345
11346 return SDValue(Load, 0);
11347 }
11348 case Intrinsic::amdgcn_end_cf:
11349 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11350 Op->getOperand(2), Chain),
11351 0);
11352 case Intrinsic::amdgcn_s_barrier_init:
11353 case Intrinsic::amdgcn_s_barrier_signal_var: {
11354 // these two intrinsics have two operands: barrier pointer and member count
11355 SDValue Chain = Op->getOperand(0);
11356 SmallVector<SDValue, 2> Ops;
11357 SDValue BarOp = Op->getOperand(2);
11358 SDValue CntOp = Op->getOperand(3);
11359 SDValue M0Val;
11360 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11361 ? AMDGPU::S_BARRIER_INIT_M0
11362 : AMDGPU::S_BARRIER_SIGNAL_M0;
11363 // extract the BarrierID from bits 4-9 of BarOp
11364 SDValue BarID;
11365 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11366 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11367 BarID =
11368 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11369 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11370 0);
11371 // Member count should be put into M0[ShAmt:+6]
11372 // Barrier ID should be put into M0[5:0]
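// For example, with ShAmt == 16, a member count of 12 and a barrier operand
// whose bits [9:4] hold barrier ID 3 would produce M0 = (12 << 16) | 3.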
11373 M0Val =
11374 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11375 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11376 0);
11377 constexpr unsigned ShAmt = 16;
11378 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11379 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11380
11381 M0Val = SDValue(
11382 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11383
11384 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11385
11386 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11387 return SDValue(NewMI, 0);
11388 }
11389 case Intrinsic::amdgcn_s_barrier_join: {
11390 // this intrinsic has one operand: the barrier pointer
11391 SDValue Chain = Op->getOperand(0);
11392 SmallVector<SDValue, 2> Ops;
11393 SDValue BarOp = Op->getOperand(2);
11394 unsigned Opc;
11395
11396 if (isa<ConstantSDNode>(BarOp)) {
11397 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11398 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11399
11400 // extract the BarrierID from bits 4-9 of the immediate
11401 unsigned BarID = (BarVal >> 4) & 0x3F;
11402 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11403 Ops.push_back(K);
11404 Ops.push_back(Chain);
11405 } else {
11406 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11407
11408 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11409 SDValue M0Val;
11410 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11411 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11412 M0Val =
11413 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11414 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11415 0);
11416 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11417 }
11418
11419 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11420 return SDValue(NewMI, 0);
11421 }
11422 case Intrinsic::amdgcn_s_prefetch_data: {
11423 // For non-global address space preserve the chain and remove the call.
11424 if (cast<MemSDNode>(Op)->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
11425 return Op.getOperand(0);
11426 return Op;
11427 }
11428 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11429 SDValue Ops[] = {
11430 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11431 Op.getOperand(3), // offset
11432 Op.getOperand(4), // length
11433 };
11434
11435 MemSDNode *M = cast<MemSDNode>(Op);
11436 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
11437 Op->getVTList(), Ops, M->getMemoryVT(),
11438 M->getMemOperand());
11439 }
11440 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11441 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11442 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11443 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11444 SDValue Chain = Op->getOperand(0);
11445 SDValue Ptr = Op->getOperand(2);
11446 SDValue Val = Op->getOperand(3);
11447 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11448 Ptr, MII->getMemOperand());
11449 }
11450 default: {
11451 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11452 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
11453 return lowerImage(Op, ImageDimIntr, DAG, true);
11454
11455 return Op;
11456 }
11457 }
11458}
11459
11460 // Return whether the operation has the NoUnsignedWrap property. An ISD::OR used as an add of disjoint values cannot wrap, so it is treated as NUW here.
11461static bool isNoUnsignedWrap(SDValue Addr) {
11462 return (Addr.getOpcode() == ISD::ADD &&
11463 Addr->getFlags().hasNoUnsignedWrap()) ||
11464 Addr->getOpcode() == ISD::OR;
11465}
11466
11467 bool SITargetLowering::shouldPreservePtrArith(const Function &F,
11468 EVT PtrVT) const {
11469 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11470}
11471
11472 bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F,
11473 EVT PtrVT) const {
11474 return true;
11475}
11476
11477// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11478// offset (the offset that is included in bounds checking and swizzling, to be
11479// split between the instruction's voffset and immoffset fields) and soffset
11480// (the offset that is excluded from bounds checking and swizzling, to go in
11481// the instruction's soffset field). This function takes the first kind of
11482// offset and figures out how to split it between voffset and immoffset.
11483std::pair<SDValue, SDValue>
11484SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11485 SDLoc DL(Offset);
11486 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11487 SDValue N0 = Offset;
11488 ConstantSDNode *C1 = nullptr;
11489
11490 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11491 N0 = SDValue();
11492 else if (DAG.isBaseWithConstantOffset(N0)) {
11493 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11494 // being added, so we can only safely match a 32-bit addition with no
11495 // unsigned overflow.
11496 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11497 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11498 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11499 N0 = N0.getOperand(0);
11500 }
11501 }
11502
11503 if (C1) {
11504 unsigned ImmOffset = C1->getZExtValue();
11505 // If the immediate value is too big for the immoffset field, put only bits
11506 // that would normally fit in the immoffset field. The remaining value that
11507 // is copied/added for the voffset field is a large power of 2, and it
11508 // stands more chance of being CSEd with the copy/add for another similar
11509 // load/store.
11510 // However, do not do that rounding down if the remaining overflow value
11511 // would be negative, as it appears to be illegal to have a negative offset
11512 // in the vgpr, even if adding the immediate offset makes it positive.
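// For illustration, assuming MaxImm == 4095: a combined offset of 4160 is
// split into ImmOffset = 64 and Overflow = 4096, leaving the CSE-friendly
// power of two 4096 to be materialized on the voffset side.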
11513 unsigned Overflow = ImmOffset & ~MaxImm;
11514 ImmOffset -= Overflow;
11515 if ((int32_t)Overflow < 0) {
11516 Overflow += ImmOffset;
11517 ImmOffset = 0;
11518 }
11519 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11520 if (Overflow) {
11521 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11522 if (!N0)
11523 N0 = OverflowVal;
11524 else {
11525 SDValue Ops[] = {N0, OverflowVal};
11526 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11527 }
11528 }
11529 }
11530 if (!N0)
11531 N0 = DAG.getConstant(0, DL, MVT::i32);
11532 if (!C1)
11533 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11534 return {N0, SDValue(C1, 0)};
11535}
11536
11537// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11538// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11539// pointed to by Offsets.
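// For illustration: a small constant combined offset that already fits the
// MUBUF immediate field (say 92, with 4-byte alignment) is expected to become
// voffset = 0, soffset = 0, instoffset = 92, while a non-constant combined
// offset falls through to voffset = CombinedOffset, soffset = 0 (or SGPR_NULL
// on subtargets with a restricted soffset), instoffset = 0.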
11540void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11541 SelectionDAG &DAG, SDValue *Offsets,
11542 Align Alignment) const {
11543 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11544 SDLoc DL(CombinedOffset);
11545 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11546 uint32_t Imm = C->getZExtValue();
11547 uint32_t SOffset, ImmOffset;
11548 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11549 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11550 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11551 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11552 return;
11553 }
11554 }
11555 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11556 SDValue N0 = CombinedOffset.getOperand(0);
11557 SDValue N1 = CombinedOffset.getOperand(1);
11558 uint32_t SOffset, ImmOffset;
11559 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11560 if (Offset >= 0 &&
11561 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11562 Offsets[0] = N0;
11563 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11564 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11565 return;
11566 }
11567 }
11568
11569 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11570 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11571 : DAG.getConstant(0, DL, MVT::i32);
11572
11573 Offsets[0] = CombinedOffset;
11574 Offsets[1] = SOffsetZero;
11575 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11576}
11577
11578SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11579 SelectionDAG &DAG) const {
11580 if (!MaybePointer.getValueType().isScalarInteger())
11581 return MaybePointer;
11582
11583 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11584 return Rsrc;
11585}
11586
11587// Wrap a global or flat pointer into a buffer intrinsic using the flags
11588// specified in the intrinsic.
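// In the common (non-45-bit num_records) layout built below, the resulting
// v4i32 is roughly:
//   word 0: base address bits [31:0]
//   word 1: base address bits [47:32] | (stride << 16)
//   word 2: num_records
//   word 3: flags
// The 45-bit num_records path packs the fields differently, as shown below.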
11589SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11590 SelectionDAG &DAG) const {
11591 SDLoc Loc(Op);
11592
11593 SDValue Pointer = Op->getOperand(1);
11594 SDValue Stride = Op->getOperand(2);
11595 SDValue NumRecords = Op->getOperand(3);
11596 SDValue Flags = Op->getOperand(4);
11597
11598 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11599 SDValue Rsrc;
11600
11601 if (Subtarget->has45BitNumRecordsBufferResource()) {
11602 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11603 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
11604 // num_records.
11605 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11606 SDValue NumRecordsLHS =
11607 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11608 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11609 SDValue LowHalf =
11610 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11611
11612 // Build the higher 64-bit value, which has the higher 38-bit num_records,
11613 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
11614 SDValue NumRecordsRHS =
11615 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11616 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11617 SDValue ShiftedStride =
11618 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11619 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11620 SDValue ExtShiftedStrideVec =
11621 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11622 SDValue ExtShiftedStride =
11623 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11624 SDValue ShiftedFlags =
11625 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11626 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11627 SDValue ExtShiftedFlagsVec =
11628 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11629 SDValue ExtShiftedFlags =
11630 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11631 SDValue CombinedFields =
11632 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11633 SDValue HighHalf =
11634 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11635
11636 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11637 } else {
11638 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11639 auto [LowHalf, HighHalf] =
11640 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11641 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11642 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11643 SDValue ShiftedStride =
11644 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11645 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11646 SDValue NewHighHalf =
11647 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11648
11649 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11650 NumRecords, Flags);
11651 }
11652
11653 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11654 return RsrcPtr;
11655}
11656
11657// Handle 8 bit and 16 bit buffer loads
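// For example, an i8 load is emitted as a BUFFER_LOAD_UBYTE-style node that
// produces an i32, which is then truncated and bitcast back to the requested
// 8- or 16-bit result type (with an extra status dword in the TFE case).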
11658SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11659 EVT LoadVT, SDLoc DL,
11660 ArrayRef<SDValue> Ops,
11661 MachineMemOperand *MMO,
11662 bool IsTFE) const {
11663 EVT IntVT = LoadVT.changeTypeToInteger();
11664
11665 if (IsTFE) {
11666 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11667 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11668 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11669 MachineFunction &MF = DAG.getMachineFunction();
11670 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11671 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11672 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11673 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11674 DAG.getConstant(1, DL, MVT::i32));
11675 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11676 DAG.getConstant(0, DL, MVT::i32));
11677 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11678 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11679 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11680 }
11681
11682 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11683 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11684 : AMDGPUISD::BUFFER_LOAD_USHORT;
11685
11686 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11687 SDValue BufferLoad =
11688 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11689 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11690 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11691
11692 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11693}
11694
11695// Handle 8 bit and 16 bit buffer stores
11696SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11697 EVT VDataType, SDLoc DL,
11698 SDValue Ops[],
11699 MemSDNode *M) const {
11700 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11701 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11702
11703 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11704 Ops[1] = BufferStoreExt;
11705 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11706 : AMDGPUISD::BUFFER_STORE_SHORT;
11707 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11708 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11709 M->getMemOperand());
11710}
11711
11712 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
11713 SDValue Op, const SDLoc &SL, EVT VT) {
11714 if (VT.bitsLT(Op.getValueType()))
11715 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11716
11717 switch (ExtType) {
11718 case ISD::SEXTLOAD:
11719 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11720 case ISD::ZEXTLOAD:
11721 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11722 case ISD::EXTLOAD:
11723 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11724 case ISD::NON_EXTLOAD:
11725 return Op;
11726 }
11727
11728 llvm_unreachable("invalid ext type");
11729}
11730
11731// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
11732// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
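// For example, a uniform 4-byte-aligned zextload of an i16 from the constant
// address space can be widened here into a full 32-bit load (SMEM eligible)
// followed by a zero-extend-in-reg of the low 16 bits.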
11733SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11734 DAGCombinerInfo &DCI) const {
11735 SelectionDAG &DAG = DCI.DAG;
11736 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11737 return SDValue();
11738
11739 // FIXME: Constant loads should all be marked invariant.
11740 unsigned AS = Ld->getAddressSpace();
11741 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11743 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11744 return SDValue();
11745
11746 // Don't do this early, since it may interfere with adjacent load merging for
11747 // illegal types. We can avoid losing alignment information for exotic types
11748 // pre-legalize.
11749 EVT MemVT = Ld->getMemoryVT();
11750 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11751 MemVT.getSizeInBits() >= 32)
11752 return SDValue();
11753
11754 SDLoc SL(Ld);
11755
11756 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11757 "unexpected vector extload");
11758
11759 // TODO: Drop only high part of range.
11760 SDValue Ptr = Ld->getBasePtr();
11761 SDValue NewLoad = DAG.getLoad(
11762 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11763 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11764 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11765 nullptr); // Drop ranges
11766
11767 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11768 if (MemVT.isFloatingPoint()) {
11769 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
11770 "unexpected fp extload");
11771 TruncVT = MemVT.changeTypeToInteger();
11772 }
11773
11774 SDValue Cvt = NewLoad;
11775 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11776 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11777 DAG.getValueType(TruncVT));
11778 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11779 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
11780 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11781 } else {
11782 assert(Ld->getExtensionType() == ISD::EXTLOAD);
11783 }
11784
11785 EVT VT = Ld->getValueType(0);
11786 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11787
11788 DCI.AddToWorklist(Cvt.getNode());
11789
11790 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11791 // the appropriate extension from the 32-bit load.
11792 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11793 DCI.AddToWorklist(Cvt.getNode());
11794
11795 // Handle conversion back to floating point if necessary.
11796 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11797
11798 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11799}
11800
11801 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
11802 const SIMachineFunctionInfo &Info) {
11803 // TODO: Should check if the address can definitely not access stack.
11804 if (Info.isEntryFunction())
11805 return Info.getUserSGPRInfo().hasFlatScratchInit();
11806 return true;
11807}
11808
11809SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11810 SDLoc DL(Op);
11811 LoadSDNode *Load = cast<LoadSDNode>(Op);
11812 ISD::LoadExtType ExtType = Load->getExtensionType();
11813 EVT MemVT = Load->getMemoryVT();
11814 MachineMemOperand *MMO = Load->getMemOperand();
11815
11816 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11817 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11818 return SDValue();
11819
11820 // FIXME: Copied from PPC
11821 // First, load into 32 bits, then truncate to 1 bit.
11822
11823 SDValue Chain = Load->getChain();
11824 SDValue BasePtr = Load->getBasePtr();
11825
11826 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11827
11828 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11829 RealMemVT, MMO);
11830
11831 if (!MemVT.isVector()) {
11832 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11833 NewLD.getValue(1)};
11834
11835 return DAG.getMergeValues(Ops, DL);
11836 }
11837
11838 SmallVector<SDValue, 3> Elts;
11839 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11840 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11841 DAG.getConstant(I, DL, MVT::i32));
11842
11843 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11844 }
11845
11846 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11847
11848 return DAG.getMergeValues(Ops, DL);
11849 }
11850
11851 if (!MemVT.isVector())
11852 return SDValue();
11853
11854 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11855 "Custom lowering for non-i32 vectors hasn't been implemented.");
11856
11857 Align Alignment = Load->getAlign();
11858 unsigned AS = Load->getAddressSpace();
11859 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11860 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11861 return SplitVectorLoad(Op, DAG);
11862 }
11863
11864 MachineFunction &MF = DAG.getMachineFunction();
11865 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11866 // If there is a possibility that flat instructions access scratch memory,
11867 // then we need to use the same legalization rules we use for private.
11868 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11869 !Subtarget->hasMultiDwordFlatScratchAddressing())
11870 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11871 ? AMDGPUAS::PRIVATE_ADDRESS
11872 : AMDGPUAS::GLOBAL_ADDRESS;
11873
11874 unsigned NumElements = MemVT.getVectorNumElements();
11875
11876 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11877 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11878 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11879 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11880 isMemOpHasNoClobberedMemOperand(Load))) {
11881 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11882 Alignment >= Align(4) && NumElements < 32) {
11883 if (MemVT.isPow2VectorType() ||
11884 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11885 return SDValue();
11886 return WidenOrSplitVectorLoad(Op, DAG);
11887 }
11888 // Non-uniform loads will be selected to MUBUF instructions, so they
11889 // have the same legalization requirements as global and private
11890 // loads.
11891 //
11892 }
11893 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11894 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
11895 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
11896 if (NumElements > 4)
11897 return SplitVectorLoad(Op, DAG);
11898 // v3 loads not supported on SI.
11899 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11900 return WidenOrSplitVectorLoad(Op, DAG);
11901
11902 // v3 and v4 loads are supported for private and global memory.
11903 return SDValue();
11904 }
11905 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11906 // Depending on the setting of the private_element_size field in the
11907 // resource descriptor, we can only make private accesses up to a certain
11908 // size.
11909 switch (Subtarget->getMaxPrivateElementSize()) {
11910 case 4: {
11911 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11912 return DAG.getMergeValues({Op0, Op1}, DL);
11913 }
11914 case 8:
11915 if (NumElements > 2)
11916 return SplitVectorLoad(Op, DAG);
11917 return SDValue();
11918 case 16:
11919 // Same as global/flat
11920 if (NumElements > 4)
11921 return SplitVectorLoad(Op, DAG);
11922 // v3 loads not supported on SI.
11923 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11924 return WidenOrSplitVectorLoad(Op, DAG);
11925
11926 return SDValue();
11927 default:
11928 llvm_unreachable("unsupported private_element_size");
11929 }
11930 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11931 unsigned Fast = 0;
11932 auto Flags = Load->getMemOperand()->getFlags();
11933 if (allowsMisalignedMemoryAccesses(MemVT, AS,
11934 Load->getAlign(), Flags, &Fast) &&
11935 Fast > 1)
11936 return SDValue();
11937
11938 if (MemVT.isVector())
11939 return SplitVectorLoad(Op, DAG);
11940 }
11941
11942 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
11943 MemVT, *Load->getMemOperand())) {
11944 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11945 return DAG.getMergeValues({Op0, Op1}, DL);
11946 }
11947
11948 return SDValue();
11949}
11950
11951SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11952 EVT VT = Op.getValueType();
11953 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11954 VT.getSizeInBits() == 512)
11955 return splitTernaryVectorOp(Op, DAG);
11956
11957 assert(VT.getSizeInBits() == 64);
11958
11959 SDLoc DL(Op);
11960 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11961
11962 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11963 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11964
11965 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11966 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11967
11968 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11969 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11970
11971 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11972
11973 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11974 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11975
11976 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11977
11978 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11979 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11980}
11981
11982// Catch division cases where we can use shortcuts with rcp and rsq
11983// instructions.
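// For example, with the afn (approximate functions) flag, 1.0 / x becomes a
// single RCP node and x / y becomes x * rcp(y); when the required accuracy
// cannot be guaranteed, no shortcut is taken and SDValue() is returned.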
11984SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11985 SelectionDAG &DAG) const {
11986 SDLoc SL(Op);
11987 SDValue LHS = Op.getOperand(0);
11988 SDValue RHS = Op.getOperand(1);
11989 EVT VT = Op.getValueType();
11990 const SDNodeFlags Flags = Op->getFlags();
11991
11992 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11993
11994 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11995 // Without !fpmath accuracy information, we can't do more because we don't
11996 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
11997 // f16 is always accurate enough
11998 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11999 return SDValue();
12000
12001 if (CLHS->isExactlyValue(1.0)) {
12002 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
12003 // the CI documentation they have a worst-case error of 1 ulp.
12004 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
12005 // use it as long as we aren't trying to use denormals.
12006 //
12007 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
12008
12009 // 1.0 / sqrt(x) -> rsq(x)
12010
12011 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12012 // error seems really high at 2^29 ULP.
12013 // 1.0 / x -> rcp(x)
12014 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12015 }
12016
12017 // Same as for 1.0, but expand the sign out of the constant.
12018 if (CLHS->isExactlyValue(-1.0)) {
12019 // -1.0 / x -> rcp (fneg x)
12020 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12021 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12022 }
12023 }
12024
12025 // For f16 and bf16 require afn or arcp.
12026 // For f32 require afn.
12027 if (!AllowInaccurateRcp &&
12028 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12029 return SDValue();
12030
12031 // Turn into multiply by the reciprocal.
12032 // x / y -> x * (1.0 / y)
12033 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12034 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12035}
12036
12037SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12038 SelectionDAG &DAG) const {
12039 SDLoc SL(Op);
12040 SDValue X = Op.getOperand(0);
12041 SDValue Y = Op.getOperand(1);
12042 EVT VT = Op.getValueType();
12043 const SDNodeFlags Flags = Op->getFlags();
12044
12045 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12046 if (!AllowInaccurateDiv)
12047 return SDValue();
12048
12049 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12050 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12051
12052 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12053 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12054
12055 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12056 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12057 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12058 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12059 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12060 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12061}
12062
12063static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12064 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12065 SDNodeFlags Flags) {
12066 if (GlueChain->getNumValues() <= 1) {
12067 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12068 }
12069
12070 assert(GlueChain->getNumValues() == 3);
12071
12072 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12073 switch (Opcode) {
12074 default:
12075 llvm_unreachable("no chain equivalent for opcode");
12076 case ISD::FMUL:
12077 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12078 break;
12079 }
12080
12081 return DAG.getNode(Opcode, SL, VTList,
12082 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12083 Flags);
12084}
12085
12086static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12087 EVT VT, SDValue A, SDValue B, SDValue C,
12088 SDValue GlueChain, SDNodeFlags Flags) {
12089 if (GlueChain->getNumValues() <= 1) {
12090 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12091 }
12092
12093 assert(GlueChain->getNumValues() == 3);
12094
12095 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12096 switch (Opcode) {
12097 default:
12098 llvm_unreachable("no chain equivalent for opcode");
12099 case ISD::FMA:
12100 Opcode = AMDGPUISD::FMA_W_CHAIN;
12101 break;
12102 }
12103
12104 return DAG.getNode(Opcode, SL, VTList,
12105 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12106 Flags);
12107}
12108
12109SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12110 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12111 return FastLowered;
12112
12113 SDLoc SL(Op);
12114 EVT VT = Op.getValueType();
12115 SDValue LHS = Op.getOperand(0);
12116 SDValue RHS = Op.getOperand(1);
12117
12118 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12119 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12120
12121 if (VT == MVT::bf16) {
12122 SDValue ExtDiv =
12123 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12124 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12125 DAG.getTargetConstant(0, SL, MVT::i32));
12126 }
12127
12128 assert(VT == MVT::f16);
12129
12130 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12131 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12132 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12133 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12134 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12135 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12136 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12137 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12138 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12139 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12140 // q16.u = opx(V_CVT_F16_F32, q32.u);
12141 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12142
12143 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12144 unsigned FMADOpCode =
12145 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
12146 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12147 SDValue Rcp =
12148 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12149 SDValue Quot =
12150 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12151 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12152 Op->getFlags());
12153 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12154 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12155 Op->getFlags());
12156 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12157 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12158 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12159 DAG.getConstant(0xff800000, SL, MVT::i32));
12160 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12161 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12162 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12163 DAG.getTargetConstant(0, SL, MVT::i32));
12164 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12165 Op->getFlags());
12166}
12167
12168// Faster 2.5 ULP division that does not support denormals.
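// The expansion below pre-scales the denominator: if |rhs| exceeds 2^96 it is
// multiplied by 2^-32 before the rcp so the reciprocal stays in range, and the
// same scale factor is applied again to the final product to compensate.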
12169SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12170 SDNodeFlags Flags = Op->getFlags();
12171 SDLoc SL(Op);
12172 SDValue LHS = Op.getOperand(1);
12173 SDValue RHS = Op.getOperand(2);
12174
12175 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12176
12177 const APFloat K0Val(0x1p+96f);
12178 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12179
12180 const APFloat K1Val(0x1p-32f);
12181 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12182
12183 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12184
12185 EVT SetCCVT =
12186 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12187
12188 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12189
12190 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12191
12192 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12193
12194 // rcp does not support denormals.
12195 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12196
12197 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12198
12199 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12200}
12201
12202// Returns immediate value for setting the F32 denorm mode when using the
12203// S_DENORM_MODE instruction.
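// The immediate packs the f32 denormal mode into bits [1:0] and the current
// f64/f16 denormal mode into bits [3:2], which is the layout S_DENORM_MODE
// consumes.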
12204 static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
12205 const SIMachineFunctionInfo *Info,
12206 const GCNSubtarget *ST) {
12207 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12208 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12209 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12210 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12211}
12212
12213SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12214 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12215 return FastLowered;
12216
12217 // The selection matcher assumes anything with a chain selects to a
12218 // mayRaiseFPException machine instruction. Since we're introducing a chain
12219 // here, we need to explicitly report nofpexcept for the regular fdiv
12220 // lowering.
12221 SDNodeFlags Flags = Op->getFlags();
12222 Flags.setNoFPExcept(true);
12223
12224 SDLoc SL(Op);
12225 SDValue LHS = Op.getOperand(0);
12226 SDValue RHS = Op.getOperand(1);
12227
12228 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12229
12230 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12231
12232 SDValue DenominatorScaled =
12233 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12234 SDValue NumeratorScaled =
12235 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12236
12237 // Denominator is scaled to not be denormal, so using rcp is ok.
12238 SDValue ApproxRcp =
12239 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12240 SDValue NegDivScale0 =
12241 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12242
12243 using namespace AMDGPU::Hwreg;
12244 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12245 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12246
12247 const MachineFunction &MF = DAG.getMachineFunction();
12248 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12249 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12250
12251 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12252 const bool HasDynamicDenormals =
12253 (DenormMode.Input == DenormalMode::Dynamic) ||
12254 (DenormMode.Output == DenormalMode::Dynamic);
12255
12256 SDValue SavedDenormMode;
12257
12258 if (!PreservesDenormals) {
12259 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12260 // lowering. The chain dependence is insufficient, and we need glue. We do
12261 // not need the glue variants in a strictfp function.
12262
12263 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12264
12265 SDValue Glue = DAG.getEntryNode();
12266 if (HasDynamicDenormals) {
12267 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12268 DAG.getVTList(MVT::i32, MVT::Glue),
12269 {BitField, Glue});
12270 SavedDenormMode = SDValue(GetReg, 0);
12271
12272 Glue = DAG.getMergeValues(
12273 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12274 }
12275
12276 SDNode *EnableDenorm;
12277 if (Subtarget->hasDenormModeInst()) {
12278 const SDValue EnableDenormValue =
12279 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
12280
12281 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12282 EnableDenormValue)
12283 .getNode();
12284 } else {
12285 const SDValue EnableDenormValue =
12286 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12287 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12288 {EnableDenormValue, BitField, Glue});
12289 }
12290
12291 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12292 SDValue(EnableDenorm, 1)};
12293
12294 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12295 }
12296
12297 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12298 ApproxRcp, One, NegDivScale0, Flags);
12299
12300 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12301 ApproxRcp, Fma0, Flags);
12302
12303 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12304 Fma1, Flags);
12305
12306 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12307 NumeratorScaled, Mul, Flags);
12308
12309 SDValue Fma3 =
12310 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12311
12312 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12313 NumeratorScaled, Fma3, Flags);
12314
12315 if (!PreservesDenormals) {
12316 SDNode *DisableDenorm;
12317 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12318 const SDValue DisableDenormValue = getSPDenormModeValue(
12319 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12320
12321 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12322 DisableDenorm =
12323 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12324 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12325 .getNode();
12326 } else {
12327 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12328 const SDValue DisableDenormValue =
12329 HasDynamicDenormals
12330 ? SavedDenormMode
12331 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12332
12333 DisableDenorm = DAG.getMachineNode(
12334 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12335 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12336 }
12337
12338 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12339 SDValue(DisableDenorm, 0), DAG.getRoot());
12340 DAG.setRoot(OutputChain);
12341 }
12342
12343 SDValue Scale = NumeratorScaled.getValue(1);
12344 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12345 {Fma4, Fma1, Fma3, Scale}, Flags);
12346
12347 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12348}
12349
12350SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12351 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12352 return FastLowered;
12353
12354 SDLoc SL(Op);
12355 SDValue X = Op.getOperand(0);
12356 SDValue Y = Op.getOperand(1);
12357
12358 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12359
12360 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12361
12362 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12363
12364 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12365
12366 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12367
12368 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12369
12370 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12371
12372 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12373
12374 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12375
12376 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12377 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12378
12379 SDValue Fma4 =
12380 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12381
12382 SDValue Scale;
12383
12384 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12385 // Work around a hardware bug on SI where the condition output from div_scale
12386 // is not usable.
12387
12388 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12389
12390 // Figure out which scale to use for div_fmas.
12391 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12392 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12393 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12394 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12395
12396 SDValue NumHi =
12397 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12398 SDValue DenHi =
12399 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12400
12401 SDValue Scale0Hi =
12402 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12403 SDValue Scale1Hi =
12404 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12405
12406 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12407 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12408 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12409 } else {
12410 Scale = DivScale1.getValue(1);
12411 }
12412
12413 SDValue Fmas =
12414 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12415
12416 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12417}
12418
12419SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12420 EVT VT = Op.getValueType();
12421
12422 if (VT == MVT::f32)
12423 return LowerFDIV32(Op, DAG);
12424
12425 if (VT == MVT::f64)
12426 return LowerFDIV64(Op, DAG);
12427
12428 if (VT == MVT::f16 || VT == MVT::bf16)
12429 return LowerFDIV16(Op, DAG);
12430
12431 llvm_unreachable("Unexpected type for fdiv");
12432}
12433
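// LowerFFREXP splits Val into Mant * 2^Exp (|Mant| in [0.5, 1.0)) using the
// amdgcn_frexp_mant / amdgcn_frexp_exp intrinsics; on subtargets with the
// fract bug, non-finite inputs are patched up with the selects below so that
// Mant = Val and Exp = 0 in that case.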
12434SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12435 SDLoc dl(Op);
12436 SDValue Val = Op.getOperand(0);
12437 EVT VT = Val.getValueType();
12438 EVT ResultExpVT = Op->getValueType(1);
12439 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12440
12441 SDValue Mant = DAG.getNode(
12442 ISD::INTRINSIC_WO_CHAIN, dl, VT,
12443 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12444
12445 SDValue Exp = DAG.getNode(
12446 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12447 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12448
12449 if (Subtarget->hasFractBug()) {
12450 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12451 SDValue Inf =
12453 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
12454 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12455 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12456 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12457 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12458 }
12459
12460 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12461 return DAG.getMergeValues({Mant, CastExp}, dl);
12462}
12463
12464SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12465 SDLoc DL(Op);
12466 StoreSDNode *Store = cast<StoreSDNode>(Op);
12467 EVT VT = Store->getMemoryVT();
12468
12469 if (VT == MVT::i1) {
12470 return DAG.getTruncStore(
12471 Store->getChain(), DL,
12472 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12473 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12474 }
12475
12476 assert(VT.isVector() &&
12477 Store->getValue().getValueType().getScalarType() == MVT::i32);
12478
12479 unsigned AS = Store->getAddressSpace();
12480 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12481 Store->getAlign().value() < VT.getStoreSize() &&
12482 VT.getSizeInBits() > 32) {
12483 return SplitVectorStore(Op, DAG);
12484 }
12485
12486 MachineFunction &MF = DAG.getMachineFunction();
12487 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12488 // If there is a possibility that flat instructions access scratch memory,
12489 // then we need to use the same legalization rules we use for private.
12490 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12491 !Subtarget->hasMultiDwordFlatScratchAddressing())
12492 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12493 ? AMDGPUAS::PRIVATE_ADDRESS
12494 : AMDGPUAS::GLOBAL_ADDRESS;
12495
12496 unsigned NumElements = VT.getVectorNumElements();
12497 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12498 if (NumElements > 4)
12499 return SplitVectorStore(Op, DAG);
12500 // v3 stores not supported on SI.
12501 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12502 return SplitVectorStore(Op, DAG);
12503
12504 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
12505 VT, *Store->getMemOperand()))
12506 return expandUnalignedStore(Store, DAG);
12507
12508 return SDValue();
12509 }
12510 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12511 switch (Subtarget->getMaxPrivateElementSize()) {
12512 case 4:
12513 return scalarizeVectorStore(Store, DAG);
12514 case 8:
12515 if (NumElements > 2)
12516 return SplitVectorStore(Op, DAG);
12517 return SDValue();
12518 case 16:
12519 if (NumElements > 4 ||
12520 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12521 return SplitVectorStore(Op, DAG);
12522 return SDValue();
12523 default:
12524 llvm_unreachable("unsupported private_element_size");
12525 }
12526 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12527 unsigned Fast = 0;
12528 auto Flags = Store->getMemOperand()->getFlags();
12529 if (allowsMisalignedMemoryAccesses(VT, AS,
12530 Store->getAlign(), Flags, &Fast) &&
12531 Fast > 1)
12532 return SDValue();
12533
12534 if (VT.isVector())
12535 return SplitVectorStore(Op, DAG);
12536
12537 return expandUnalignedStore(Store, DAG);
12538 }
12539
12540 // Probably an invalid store. If so we'll end up emitting a selection error.
12541 return SDValue();
12542}
12543
12544// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12545SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12546 SDLoc SL(Op);
12547 assert(!Subtarget->has16BitInsts());
12548 SDNodeFlags Flags = Op->getFlags();
12549 SDValue Ext =
12550 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12551
12552 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12553 SDValue Sqrt =
12554 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12555
12556 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12557 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12558}
12559
12560SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12561 SDLoc DL(Op);
12562 SDNodeFlags Flags = Op->getFlags();
12563 MVT VT = Op.getValueType().getSimpleVT();
12564 const SDValue X = Op.getOperand(0);
12565
12566 if (allowApproxFunc(DAG, Flags)) {
12567 // Instruction is 1ulp but ignores denormals.
12568 return DAG.getNode(
12569 ISD::INTRINSIC_WO_CHAIN, DL, VT,
12570 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12571 }
12572
12573 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12574 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12575
12576 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12577
12578 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12579
12580 SDValue SqrtX =
12581 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
12582
12583 SDValue SqrtS;
12584 if (needsDenormHandlingF32(DAG, X, Flags)) {
12585 SDValue SqrtID =
12586 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12587 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12588
12589 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12590 SDValue SqrtSNextDownInt =
12591 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12592 DAG.getAllOnesConstant(DL, MVT::i32));
12593 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12594
12595 SDValue NegSqrtSNextDown =
12596 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12597
12598 SDValue SqrtVP =
12599 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12600
12601 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12602 DAG.getConstant(1, DL, MVT::i32));
12603 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12604
12605 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12606 SDValue SqrtVS =
12607 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12608
12609 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12610 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12611
12612 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12613 Flags);
12614
12615 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12616 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12617 Flags);
12618 } else {
12619 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12620
12621 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12622
12623 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12624 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12625 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12626
12627 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12628 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12629 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12630
12631 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12632 SDValue SqrtD =
12633 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12634 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12635 }
12636
12637 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12638
12639 SDValue ScaledDown =
12640 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12641
12642 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12643 SDValue IsZeroOrInf =
12644 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12645 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12646
12647 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12648}
12649
12650SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12651 // For the double type, the SQRT and RSQ instructions don't have the required
12652 // precision, so we apply Goldschmidt's algorithm to improve the result:
12653 //
12654 // y0 = rsq(x)
12655 // g0 = x * y0
12656 // h0 = 0.5 * y0
12657 //
12658 // r0 = 0.5 - h0 * g0
12659 // g1 = g0 * r0 + g0
12660 // h1 = h0 * r0 + h0
12661 //
12662 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12663 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12664 // h2 = h1 * r1 + h1
12665 //
12666 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12667 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12668 //
12669 // sqrt(x) = g3
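// The scaling below relies on sqrt(x * 2^256) == sqrt(x) * 2^128: inputs below
// 2^-767 are scaled up with ldexp(x, 256) before the iteration, and the result
// is scaled back down with ldexp(result, -128).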
12670
12671 SDNodeFlags Flags = Op->getFlags();
12672
12673 SDLoc DL(Op);
12674
12675 SDValue X = Op.getOperand(0);
12676 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12677
12678 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12679
12680 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12681
12682 // Scale up input if it is too small.
12683 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12684 SDValue ScaleUp =
12685 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12686 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12687
12688 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12689
12690 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12691
12692 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12693 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12694
12695 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12696 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12697
12698 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12699
12700 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12701
12702 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12703 SDValue SqrtD0 =
12704 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12705
12706 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12707
12708 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12709 SDValue SqrtD1 =
12710 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12711
12712 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12713
12714 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12715 SDValue ScaleDown =
12716 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12717 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12718
12719 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12720 // with finite only or nsz because rsq(+/-0) = +/-inf
12721
12722 // TODO: Check for DAZ and expand to subnormals
12723 SDValue IsZeroOrInf =
12724 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12725 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12726
12727 // If x is +INF, +0, or -0, use its original value
12728 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12729 Flags);
12730}
12731
12732SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12733 SDLoc DL(Op);
12734 EVT VT = Op.getValueType();
12735 SDValue Arg = Op.getOperand(0);
12736 SDValue TrigVal;
12737
12738 // Propagate fast-math flags so that the multiply we introduce can be folded
12739 // if Arg is already the result of a multiply by constant.
12740 auto Flags = Op->getFlags();
12741
12742 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12743
12744 if (Subtarget->hasTrigReducedRange()) {
12745 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12746 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12747 } else {
12748 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12749 }
12750
12751 switch (Op.getOpcode()) {
12752 case ISD::FCOS:
12753 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12754 case ISD::FSIN:
12755 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12756 default:
12757 llvm_unreachable("Wrong trig opcode");
12758 }
12759}
12760
12761SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12762 SelectionDAG &DAG) const {
12763 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12764 assert(AtomicNode->isCompareAndSwap());
12765 unsigned AS = AtomicNode->getAddressSpace();
12766
12767 // No custom lowering required for local address space
12768 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
12769 return Op;
12770
12771 // Non-local address space requires custom lowering for atomic compare
12772 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
12773 SDLoc DL(Op);
12774 SDValue ChainIn = Op.getOperand(0);
12775 SDValue Addr = Op.getOperand(1);
12776 SDValue Old = Op.getOperand(2);
12777 SDValue New = Op.getOperand(3);
12778 EVT VT = Op.getValueType();
12779 MVT SimpleVT = VT.getSimpleVT();
12780 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12781
12782 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12783 SDValue Ops[] = {ChainIn, Addr, NewOld};
12784
12785 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
12786 Op->getVTList(), Ops, VT,
12787 AtomicNode->getMemOperand());
12788}
12789
12790//===----------------------------------------------------------------------===//
12791// Custom DAG optimizations
12792//===----------------------------------------------------------------------===//
12793
12794SDValue
12795SITargetLowering::performUCharToFloatCombine(SDNode *N,
12796 DAGCombinerInfo &DCI) const {
12797 EVT VT = N->getValueType(0);
12798 EVT ScalarVT = VT.getScalarType();
12799 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12800 return SDValue();
12801
12802 SelectionDAG &DAG = DCI.DAG;
12803 SDLoc DL(N);
12804
12805 SDValue Src = N->getOperand(0);
12806 EVT SrcVT = Src.getValueType();
12807
12808 // TODO: We could try to match extracting the higher bytes, which would be
12809 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12810 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12811 // about in practice.
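// Illustrative example (operand names are hypothetical): when the top 24 bits
// of the source are known zero, e.g. Src = (and i32:x, 0xff),
//   (f32 (uint_to_fp Src)) -> (CVT_F32_UBYTE0 Src)
// and for an f16 result the conversion is followed by an fp_round back to f16.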
12812 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12813 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12814 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12815 DCI.AddToWorklist(Cvt.getNode());
12816
12817 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12818 if (ScalarVT != MVT::f32) {
12819 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12820 DAG.getTargetConstant(0, DL, MVT::i32));
12821 }
12822 return Cvt;
12823 }
12824 }
12825
12826 return SDValue();
12827}
12828
12829SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12830 DAGCombinerInfo &DCI) const {
12831 SDValue MagnitudeOp = N->getOperand(0);
12832 SDValue SignOp = N->getOperand(1);
12833
12834 // The generic combine for fcopysign + fp cast is too conservative with
12835 // vectors, and also gets confused by the splitting we will perform here, so
12836 // peek through FP casts.
12837 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12838 SignOp.getOpcode() == ISD::FP_ROUND)
12839 SignOp = SignOp.getOperand(0);
12840
12841 SelectionDAG &DAG = DCI.DAG;
12842 SDLoc DL(N);
12843 EVT SignVT = SignOp.getValueType();
12844
12845 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12846 // lower half with a copy.
12847 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12848 EVT MagVT = MagnitudeOp.getValueType();
12849
12850 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12851
12852 if (MagVT.getScalarType() == MVT::f64) {
12853 EVT F32VT = MagVT.isVector()
12854 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12855 : MVT::v2f32;
12856
12857 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12858
12859 SmallVector<SDValue, 8> NewElts;
12860 for (unsigned I = 0; I != NumElts; ++I) {
12861 SDValue MagLo =
12862 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12863 DAG.getConstant(2 * I, DL, MVT::i32));
12864 SDValue MagHi =
12865 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12866 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12867
12868 SDValue SignOpElt =
12869 MagVT.isVector()
12870 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12871 SignOp, DAG.getConstant(I, DL, MVT::i32))
12872 : SignOp;
12873
12874 SDValue HiOp =
12875 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12876
12877 SDValue Vector =
12878 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12879
12880 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12881 NewElts.push_back(NewElt);
12882 }
12883
12884 if (NewElts.size() == 1)
12885 return NewElts[0];
12886
12887 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12888 }
12889
12890 if (SignVT.getScalarType() != MVT::f64)
12891 return SDValue();
12892
12893 // Reduce the width of the sign operand; we only need the highest bit.
12894 //
12895 // fcopysign f64:x, f64:y ->
12896 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12897 // TODO: In some cases it might make sense to go all the way to f16.
12898
12899 EVT F32VT = MagVT.isVector()
12900 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12901 : MVT::v2f32;
12902
12903 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12904
12905 SmallVector<SDValue, 8> F32Signs;
12906 for (unsigned I = 0; I != NumElts; ++I) {
12907 // Take sign from odd elements of cast vector
12908 SDValue SignAsF32 =
12909 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12910 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12911 F32Signs.push_back(SignAsF32);
12912 }
12913
12914 SDValue NewSign =
12915 NumElts == 1
12916 ? F32Signs.back()
12917 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12918 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12919 F32Signs);
12920
12921 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12922 NewSign);
12923}
12924
12925// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12926// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12927// bits
12928
12929// This is a variant of
12930// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12931//
12932 // The normal DAG combiner will do this, but only if the add has one use, since
12933 // otherwise it would increase the number of instructions.
12934//
12935// This prevents us from seeing a constant offset that can be folded into a
12936// memory instruction's addressing mode. If we know the resulting add offset of
12937// a pointer can be folded into an addressing offset, we can replace the pointer
12938 // operand with the add of the new constant offset. This eliminates one of the uses,
12939// and may allow the remaining use to also be simplified.
12940//
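// A hypothetical instance (constants chosen for illustration): for an address
// computed as (shl (add x, 16), 2), the combine produces
//   (add (shl x, 2), 64)
// and the constant 64 can then be folded into the memory instruction's
// addressing-mode offset when isLegalAddressingMode accepts it.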
12941SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12942 EVT MemVT,
12943 DAGCombinerInfo &DCI) const {
12944 SDValue N0 = N->getOperand(0);
12945 SDValue N1 = N->getOperand(1);
12946
12947 // We only do this to handle cases where it's profitable when there are
12948 // multiple uses of the add, so defer to the standard combine.
12949 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
12950 return SDValue();
12951
12952 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12953 if (!CN1)
12954 return SDValue();
12955
12956 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12957 if (!CAdd)
12958 return SDValue();
12959
12960 SelectionDAG &DAG = DCI.DAG;
12961
12962 if (N0->getOpcode() == ISD::OR &&
12963 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12964 return SDValue();
12965
12966 // If the resulting offset is too large, we can't fold it into the
12967 // addressing mode offset.
12968 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12969 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12970
12971 AddrMode AM;
12972 AM.HasBaseReg = true;
12973 AM.BaseOffs = Offset.getSExtValue();
12974 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12975 return SDValue();
12976
12977 SDLoc SL(N);
12978 EVT VT = N->getValueType(0);
12979
12980 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12981 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12982
12983 SDNodeFlags Flags;
12984 Flags.setNoUnsignedWrap(
12985 N->getFlags().hasNoUnsignedWrap() &&
12986 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12987
12988 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
12989 // be sure that the new left operand is a proper base pointer.
12990 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12991}
12992
12993 /// MemSDNode::getBasePtr() does not work for intrinsics, which need an offset
12994 /// to account for the chain and intrinsic ID. In theory we would also need to
12995 /// check the specific intrinsic, but they all place the pointer operand first.
12996static unsigned getBasePtrIndex(const MemSDNode *N) {
12997 switch (N->getOpcode()) {
12998 case ISD::STORE:
13001 return 2;
13002 default:
13003 return 1;
13004 }
13005}
13006
13007SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
13008 DAGCombinerInfo &DCI) const {
13009 SelectionDAG &DAG = DCI.DAG;
13010
13011 unsigned PtrIdx = getBasePtrIndex(N);
13012 SDValue Ptr = N->getOperand(PtrIdx);
13013
13014 // TODO: We could also do this for multiplies.
13015 if (Ptr.getOpcode() == ISD::SHL) {
13016 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13017 N->getMemoryVT(), DCI);
13018 if (NewPtr) {
13019 SmallVector<SDValue, 8> NewOps(N->ops());
13020
13021 NewOps[PtrIdx] = NewPtr;
13022 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13023 }
13024 }
13025
13026 return SDValue();
13027}
13028
13029static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13030 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13031 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13032 (Opc == ISD::XOR && Val == 0);
13033}
13034
13035 // Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor
13036 // ops. This will typically happen anyway for a VALU 64-bit and. It exposes other
13037 // 32-bit integer combine opportunities, since most 64-bit operations are decomposed
13038// this way. TODO: We won't want this for SALU especially if it is an inline
13039// immediate.
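// Example (illustrative constant): (and i64:x, 0xffffffff00000000) splits into
// a 32-bit 'and' of the low half with 0 and of the high half with 0xffffffff;
// both halves are then reducible by bitOpWithConstantIsReducible.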
13040SDValue SITargetLowering::splitBinaryBitConstantOp(
13041 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13042 const ConstantSDNode *CRHS) const {
13043 uint64_t Val = CRHS->getZExtValue();
13044 uint32_t ValLo = Lo_32(Val);
13045 uint32_t ValHi = Hi_32(Val);
13046 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13047
13048 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13049 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13050 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13051 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13052 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13053 !CRHS->user_begin()->isDivergent())
13054 return SDValue();
13055
13056 // If we need to materialize a 64-bit immediate, it will be split up later
13057 // anyway. Avoid creating the harder to understand 64-bit immediate
13058 // materialization.
13059 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13060 }
13061
13062 return SDValue();
13063}
13064
13065 bool llvm::isBoolSGPR(SDValue V) {
13066 if (V.getValueType() != MVT::i1)
13067 return false;
13068 switch (V.getOpcode()) {
13069 default:
13070 break;
13071 case ISD::SETCC:
13072 case ISD::IS_FPCLASS:
13073 case AMDGPUISD::FP_CLASS:
13074 return true;
13075 case ISD::AND:
13076 case ISD::OR:
13077 case ISD::XOR:
13078 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13079 case ISD::SADDO:
13080 case ISD::UADDO:
13081 case ISD::SSUBO:
13082 case ISD::USUBO:
13083 case ISD::SMULO:
13084 case ISD::UMULO:
13085 return V.getResNo() == 1;
13086 case ISD::INTRINSIC_WO_CHAIN: {
13087 unsigned IntrinsicID = V.getConstantOperandVal(0);
13088 switch (IntrinsicID) {
13089 case Intrinsic::amdgcn_is_shared:
13090 case Intrinsic::amdgcn_is_private:
13091 return true;
13092 default:
13093 return false;
13094 }
13095
13096 return false;
13097 }
13098 }
13099 return false;
13100}
13101
13102// If a constant has all zeroes or all ones within each byte return it.
13103// Otherwise return 0.
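// Example (illustrative): 0x00ff00ff is returned unchanged because every byte
// is all ones or all zeroes, while 0x00f0ff00 yields 0 because one byte is
// only partially selected.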
13104 static uint32_t getConstantPermuteMask(uint32_t C) {
13105 // 0xff for any zero byte in the mask
13106 uint32_t ZeroByteMask = 0;
13107 if (!(C & 0x000000ff))
13108 ZeroByteMask |= 0x000000ff;
13109 if (!(C & 0x0000ff00))
13110 ZeroByteMask |= 0x0000ff00;
13111 if (!(C & 0x00ff0000))
13112 ZeroByteMask |= 0x00ff0000;
13113 if (!(C & 0xff000000))
13114 ZeroByteMask |= 0xff000000;
13115 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13116 if ((NonZeroByteMask & C) != NonZeroByteMask)
13117 return 0; // Partial bytes selected.
13118 return C;
13119}
13120
13121 // Check if a node selects whole bytes from its operand 0 starting at a byte
13122 // boundary while masking the rest. Returns the select mask as used by
13123 // v_perm_b32, or ~0 if the match fails.
13124// Note byte select encoding:
13125// value 0-3 selects corresponding source byte;
13126// value 0xc selects zero;
13127// value 0xff selects 0xff.
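// Example (illustrative): for (srl x, 16) the returned mask is 0x0c0c0302 --
// result bytes 0-1 come from source bytes 2-3 and the upper two bytes are zero.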
13128 static uint32_t getPermuteMask(SDValue V) {
13129 assert(V.getValueSizeInBits() == 32);
13130
13131 if (V.getNumOperands() != 2)
13132 return ~0;
13133
13134 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13135 if (!N1)
13136 return ~0;
13137
13138 uint32_t C = N1->getZExtValue();
13139
13140 switch (V.getOpcode()) {
13141 default:
13142 break;
13143 case ISD::AND:
13144 if (uint32_t ConstMask = getConstantPermuteMask(C))
13145 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13146 break;
13147
13148 case ISD::OR:
13149 if (uint32_t ConstMask = getConstantPermuteMask(C))
13150 return (0x03020100 & ~ConstMask) | ConstMask;
13151 break;
13152
13153 case ISD::SHL:
13154 if (C % 8)
13155 return ~0;
13156
13157 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13158
13159 case ISD::SRL:
13160 if (C % 8)
13161 return ~0;
13162
13163 return uint32_t(0x0c0c0c0c03020100ull >> C);
13164 }
13165
13166 return ~0;
13167}
13168
13169SDValue SITargetLowering::performAndCombine(SDNode *N,
13170 DAGCombinerInfo &DCI) const {
13171 if (DCI.isBeforeLegalize())
13172 return SDValue();
13173
13174 SelectionDAG &DAG = DCI.DAG;
13175 EVT VT = N->getValueType(0);
13176 SDValue LHS = N->getOperand(0);
13177 SDValue RHS = N->getOperand(1);
13178
13179 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13180 if (VT == MVT::i64 && CRHS) {
13181 if (SDValue Split =
13182 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13183 return Split;
13184 }
13185
13186 if (CRHS && VT == MVT::i32) {
13187 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13188 // nb = number of trailing zeroes in mask
13189 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13190 // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
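// A concrete instance (illustrative): (and (srl x, 8), 0xff00) is rewritten to
// extract the 8-bit field of x at bit offset 16 with BFE_U32 and shift it left
// by 8, which reproduces (x >> 8) & 0xff00.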
13191 uint64_t Mask = CRHS->getZExtValue();
13192 unsigned Bits = llvm::popcount(Mask);
13193 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13194 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13195 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13196 unsigned Shift = CShift->getZExtValue();
13197 unsigned NB = CRHS->getAPIntValue().countr_zero();
13198 unsigned Offset = NB + Shift;
13199 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13200 SDLoc SL(N);
13201 SDValue BFE =
13202 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13203 DAG.getConstant(Offset, SL, MVT::i32),
13204 DAG.getConstant(Bits, SL, MVT::i32));
13205 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13206 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13207 DAG.getValueType(NarrowVT));
13208 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13209 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13210 return Shl;
13211 }
13212 }
13213 }
13214
13215 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13216 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13217 isa<ConstantSDNode>(LHS.getOperand(2))) {
13218 uint32_t Sel = getConstantPermuteMask(Mask);
13219 if (!Sel)
13220 return SDValue();
13221
13222 // Select 0xc for all zero bytes
13223 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13224 SDLoc DL(N);
13225 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13226 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13227 }
13228 }
13229
13230 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13231 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13232 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13233 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13234 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13235
13236 SDValue X = LHS.getOperand(0);
13237 SDValue Y = RHS.getOperand(0);
13238 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13239 !isTypeLegal(X.getValueType()))
13240 return SDValue();
13241
13242 if (LCC == ISD::SETO) {
13243 if (X != LHS.getOperand(1))
13244 return SDValue();
13245
13246 if (RCC == ISD::SETUNE) {
13247 const ConstantFPSDNode *C1 =
13248 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13249 if (!C1 || !C1->isInfinity() || C1->isNegative())
13250 return SDValue();
13251
13252 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13253 SIInstrFlags::N_SUBNORMAL | SIInstrFlags::N_ZERO |
13254 SIInstrFlags::P_ZERO | SIInstrFlags::P_SUBNORMAL |
13255 SIInstrFlags::P_NORMAL;
13256
13257 static_assert(
13258 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13259 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13260 0x3ff) == Mask,
13261 "mask not equal");
13262
13263 SDLoc DL(N);
13264 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13265 DAG.getConstant(Mask, DL, MVT::i32));
13266 }
13267 }
13268 }
13269
13270 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13271 std::swap(LHS, RHS);
13272
13273 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13274 RHS.hasOneUse()) {
13275 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13276 // and (fcmp seto), (fp_class x, mask) ->
13277 //   fp_class x, mask & ~(p_nan | n_nan)
13278 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
13279 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13280 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13281 (RHS.getOperand(0) == LHS.getOperand(0) &&
13282 LHS.getOperand(0) == LHS.getOperand(1))) {
13283 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13284 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13285 : Mask->getZExtValue() & OrdMask;
13286
13287 SDLoc DL(N);
13288 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13289 DAG.getConstant(NewMask, DL, MVT::i32));
13290 }
13291 }
13292
13293 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13294 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13295 // and x, (sext cc from i1) => select cc, x, 0
13296 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13297 std::swap(LHS, RHS);
13298 if (isBoolSGPR(RHS.getOperand(0)))
13299 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13300 DAG.getConstant(0, SDLoc(N), MVT::i32));
13301 }
13302
13303 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13304 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13305 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13306 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13307 uint32_t LHSMask = getPermuteMask(LHS);
13308 uint32_t RHSMask = getPermuteMask(RHS);
13309 if (LHSMask != ~0u && RHSMask != ~0u) {
13310 // Canonicalize the expression in an attempt to have fewer unique masks
13311 // and therefore fewer registers used to hold the masks.
13312 if (LHSMask > RHSMask) {
13313 std::swap(LHSMask, RHSMask);
13314 std::swap(LHS, RHS);
13315 }
13316
13317 // Mark with 0xc each lane that uses a byte of the source operand. In the
13318 // mask, zero lanes are 0x0c, 0xff lanes are 0xff, and real lanes are 0-3.
13319 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13320 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13321
13322 // Check if we need to combine values from two sources within a byte.
13323 if (!(LHSUsedLanes & RHSUsedLanes) &&
13324 // If we select high and lower word keep it for SDWA.
13325 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13326 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13327 // Each byte of each mask is either a selector value 0-3, or has higher
13328 // bits set: 0xff selects 0xff and 0x0c selects zero. If either mask has
13329 // 0x0c in a byte, the result byte must be 0x0c. Otherwise the mask that
13330 // is not 0xff wins. ANDing both masks gives the correct result, except
13331 // that bytes which should be zero must be corrected back to 0x0c.
13332 uint32_t Mask = LHSMask & RHSMask;
13333 for (unsigned I = 0; I < 32; I += 8) {
13334 uint32_t ByteSel = 0xff << I;
13335 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13336 Mask &= (0x0c << I) & 0xffffffff;
13337 }
13338
13339 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13340 // or 0x0c.
13341 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13342 SDLoc DL(N);
13343
13344 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13345 RHS.getOperand(0),
13346 DAG.getConstant(Sel, DL, MVT::i32));
13347 }
13348 }
13349 }
13350
13351 return SDValue();
13352}
13353
13354 // A key component of v_perm is a mapping between the byte positions of the src
13355 // operands and the byte positions of the dest. To build it, we need: 1. the
13356 // node that provides byte x of the dest of the OR, and 2. the byte of that node
13357 // used to provide it. calculateByteProvider finds which node provides a certain
13358 // byte of the dest of the OR, and calculateSrcByte takes that node and finds
13359 // the ultimate src and byte position. For example, the supported LoadCombine
13360 // pattern for vector loads is as follows:
13361// t1
13362// or
13363// / \
13364// t2 t3
13365// zext shl
13366// | | \
13367// t4 t5 16
13368// or anyext
13369// / \ |
13370// t6 t7 t8
13371// srl shl or
13372// / | / \ / \
13373// t9 t10 t11 t12 t13 t14
13374// trunc* 8 trunc* 8 and and
13375// | | / | | \
13376// t15 t16 t17 t18 t19 t20
13377// trunc* 255 srl -256
13378// | / \
13379// t15 t15 16
13380//
13381// *In this example, the truncs are from i32->i16
13382//
13383// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13384// respectively. calculateSrcByte would find (given node) -> ultimate src &
13385 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13386// After finding the mapping, we can combine the tree into vperm t15, t16,
13387// 0x05000407
13388
13389// Find the source and byte position from a node.
13390// \p DestByte is the byte position of the dest of the or that the src
13391// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13392 // byte of the or's dest. \p Depth tracks how many recursive iterations we have
13393// performed.
13394static const std::optional<ByteProvider<SDValue>>
13395calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13396 unsigned Depth = 0) {
13397 // We may need to recursively traverse a series of SRLs
13398 if (Depth >= 6)
13399 return std::nullopt;
13400
13401 if (Op.getValueSizeInBits() < 8)
13402 return std::nullopt;
13403
13404 if (Op.getValueType().isVector())
13405 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13406
13407 switch (Op->getOpcode()) {
13408 case ISD::TRUNCATE: {
13409 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13410 }
13411
13412 case ISD::SIGN_EXTEND:
13413 case ISD::ZERO_EXTEND:
13414 case ISD::SIGN_EXTEND_INREG: {
13415 SDValue NarrowOp = Op->getOperand(0);
13416 auto NarrowVT = NarrowOp.getValueType();
13417 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13418 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13419 NarrowVT = VTSign->getVT();
13420 }
13421 if (!NarrowVT.isByteSized())
13422 return std::nullopt;
13423 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13424
13425 if (SrcIndex >= NarrowByteWidth)
13426 return std::nullopt;
13427 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13428 }
13429
13430 case ISD::SRA:
13431 case ISD::SRL: {
13432 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13433 if (!ShiftOp)
13434 return std::nullopt;
13435
13436 uint64_t BitShift = ShiftOp->getZExtValue();
13437
13438 if (BitShift % 8 != 0)
13439 return std::nullopt;
13440
13441 SrcIndex += BitShift / 8;
13442
13443 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13444 }
13445
13446 default: {
13447 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13448 }
13449 }
13450 llvm_unreachable("fully handled switch");
13451}
13452
13453// For a byte position in the result of an Or, traverse the tree and find the
13454// node (and the byte of the node) which ultimately provides this {Or,
13455// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13456// the byte position of the Op that corresponds with the originally requested
13457 // byte of the Or. \p Depth tracks how many recursive iterations we have
13458 // performed. \p StartingIndex is the originally requested byte of the Or.
13459static const std::optional<ByteProvider<SDValue>>
13460calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13461 unsigned StartingIndex = 0) {
13462 // Finding Src tree of RHS of or typically requires at least 1 additional
13463 // depth
13464 if (Depth > 6)
13465 return std::nullopt;
13466
13467 unsigned BitWidth = Op.getScalarValueSizeInBits();
13468 if (BitWidth % 8 != 0)
13469 return std::nullopt;
13470 if (Index > BitWidth / 8 - 1)
13471 return std::nullopt;
13472
13473 bool IsVec = Op.getValueType().isVector();
13474 switch (Op.getOpcode()) {
13475 case ISD::OR: {
13476 if (IsVec)
13477 return std::nullopt;
13478
13479 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13480 StartingIndex);
13481 if (!RHS)
13482 return std::nullopt;
13483 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13484 StartingIndex);
13485 if (!LHS)
13486 return std::nullopt;
13487 // A well formed Or will have two ByteProviders for each byte, one of which
13488 // is constant zero
13489 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13490 return std::nullopt;
13491 if (!LHS || LHS->isConstantZero())
13492 return RHS;
13493 if (!RHS || RHS->isConstantZero())
13494 return LHS;
13495 return std::nullopt;
13496 }
13497
13498 case ISD::AND: {
13499 if (IsVec)
13500 return std::nullopt;
13501
13502 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13503 if (!BitMaskOp)
13504 return std::nullopt;
13505
13506 uint32_t BitMask = BitMaskOp->getZExtValue();
13507 // Bits we expect for our StartingIndex
13508 uint32_t IndexMask = 0xFF << (Index * 8);
13509
13510 if ((IndexMask & BitMask) != IndexMask) {
13511 // If the result of the and partially provides the byte, then it
13512 // is not well formatted
13513 if (IndexMask & BitMask)
13514 return std::nullopt;
13515 return ByteProvider<SDValue>::getConstantZero();
13516 }
13517
13518 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13519 }
13520
13521 case ISD::FSHR: {
13522 if (IsVec)
13523 return std::nullopt;
13524
13525 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13526 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13527 if (!ShiftOp || Op.getValueType().isVector())
13528 return std::nullopt;
13529
13530 uint64_t BitsProvided = Op.getValueSizeInBits();
13531 if (BitsProvided % 8 != 0)
13532 return std::nullopt;
13533
13534 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13535 if (BitShift % 8)
13536 return std::nullopt;
13537
13538 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13539 uint64_t ByteShift = BitShift / 8;
13540
13541 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13542 uint64_t BytesProvided = BitsProvided / 8;
13543 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13544 NewIndex %= BytesProvided;
13545 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13546 }
13547
13548 case ISD::SRA:
13549 case ISD::SRL: {
13550 if (IsVec)
13551 return std::nullopt;
13552
13553 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13554 if (!ShiftOp)
13555 return std::nullopt;
13556
13557 uint64_t BitShift = ShiftOp->getZExtValue();
13558 if (BitShift % 8)
13559 return std::nullopt;
13560
13561 auto BitsProvided = Op.getScalarValueSizeInBits();
13562 if (BitsProvided % 8 != 0)
13563 return std::nullopt;
13564
13565 uint64_t BytesProvided = BitsProvided / 8;
13566 uint64_t ByteShift = BitShift / 8;
13567 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13568 // If the byte we are trying to provide (as tracked by index) falls in this
13569 // range, then the SRL provides the byte. The byte of interest of the src of
13570 // the SRL is Index + ByteShift
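// Worked example (illustrative): for i32 (srl x, 16), ByteShift = 2 and
// BytesProvided = 4, so result byte 1 is source byte 3 of x, while result
// bytes 2 and 3 fall outside the good range and are known zeros.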
13571 return BytesProvided - ByteShift > Index
13572 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13573 Index + ByteShift)
13574 : ByteProvider<SDValue>::getConstantZero();
13575 }
13576
13577 case ISD::SHL: {
13578 if (IsVec)
13579 return std::nullopt;
13580
13581 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13582 if (!ShiftOp)
13583 return std::nullopt;
13584
13585 uint64_t BitShift = ShiftOp->getZExtValue();
13586 if (BitShift % 8 != 0)
13587 return std::nullopt;
13588 uint64_t ByteShift = BitShift / 8;
13589
13590 // If we are shifting by an amount greater than (or equal to)
13591 // the index we are trying to provide, then it provides 0s. If not,
13592 // then these bytes are not definitively 0s, and the corresponding byte
13593 // of interest is Index - ByteShift of the src
13594 return Index < ByteShift
13595 ? ByteProvider<SDValue>::getConstantZero()
13596 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13597 Depth + 1, StartingIndex);
13598 }
13599 case ISD::ANY_EXTEND:
13600 case ISD::SIGN_EXTEND:
13601 case ISD::ZERO_EXTEND:
13602 case ISD::SIGN_EXTEND_INREG:
13603 case ISD::AssertZext:
13604 case ISD::AssertSext: {
13605 if (IsVec)
13606 return std::nullopt;
13607
13608 SDValue NarrowOp = Op->getOperand(0);
13609 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13610 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13611 Op->getOpcode() == ISD::AssertZext ||
13612 Op->getOpcode() == ISD::AssertSext) {
13613 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13614 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13615 }
13616 if (NarrowBitWidth % 8 != 0)
13617 return std::nullopt;
13618 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13619
13620 if (Index >= NarrowByteWidth)
13621 return Op.getOpcode() == ISD::ZERO_EXTEND
13622 ? std::optional<ByteProvider<SDValue>>(
13623 ByteProvider<SDValue>::getConstantZero())
13624 : std::nullopt;
13625 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13626 }
13627
13628 case ISD::TRUNCATE: {
13629 if (IsVec)
13630 return std::nullopt;
13631
13632 uint64_t NarrowByteWidth = BitWidth / 8;
13633
13634 if (NarrowByteWidth >= Index) {
13635 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13636 StartingIndex);
13637 }
13638
13639 return std::nullopt;
13640 }
13641
13642 case ISD::CopyFromReg: {
13643 if (BitWidth / 8 > Index)
13644 return calculateSrcByte(Op, StartingIndex, Index);
13645
13646 return std::nullopt;
13647 }
13648
13649 case ISD::LOAD: {
13650 auto *L = cast<LoadSDNode>(Op.getNode());
13651
13652 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13653 if (NarrowBitWidth % 8 != 0)
13654 return std::nullopt;
13655 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13656
13657 // If the width of the load does not reach the byte we are trying to
13658 // provide and it is not a ZEXTLOAD, then the load does not provide that
13659 // byte.
13660 if (Index >= NarrowByteWidth) {
13661 return L->getExtensionType() == ISD::ZEXTLOAD
13662 ? std::optional<ByteProvider<SDValue>>(
13663 ByteProvider<SDValue>::getConstantZero())
13664 : std::nullopt;
13665 }
13666
13667 if (NarrowByteWidth > Index) {
13668 return calculateSrcByte(Op, StartingIndex, Index);
13669 }
13670
13671 return std::nullopt;
13672 }
13673
13674 case ISD::BSWAP: {
13675 if (IsVec)
13676 return std::nullopt;
13677
13678 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13679 Depth + 1, StartingIndex);
13680 }
13681
13682 case ISD::EXTRACT_VECTOR_ELT: {
13683 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13684 if (!IdxOp)
13685 return std::nullopt;
13686 auto VecIdx = IdxOp->getZExtValue();
13687 auto ScalarSize = Op.getScalarValueSizeInBits();
13688 if (ScalarSize < 32)
13689 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13690 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13691 StartingIndex, Index);
13692 }
13693
13694 case AMDGPUISD::PERM: {
13695 if (IsVec)
13696 return std::nullopt;
13697
13698 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13699 if (!PermMask)
13700 return std::nullopt;
13701
13702 auto IdxMask =
13703 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13704 if (IdxMask > 0x07 && IdxMask != 0x0c)
13705 return std::nullopt;
13706
13707 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13708 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13709
13710 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13711 : ByteProvider<SDValue>(
13712 ByteProvider<SDValue>::getConstantZero());
13713 }
13714
13715 default: {
13716 return std::nullopt;
13717 }
13718 }
13719
13720 llvm_unreachable("fully handled switch");
13721}
13722
13723 // Returns true if the operand is a scalar extended or loaded from a 16-bit value.
13724static bool isExtendedFrom16Bits(SDValue &Operand) {
13725
13726 switch (Operand.getOpcode()) {
13727 case ISD::ANY_EXTEND:
13728 case ISD::SIGN_EXTEND:
13729 case ISD::ZERO_EXTEND: {
13730 auto OpVT = Operand.getOperand(0).getValueType();
13731 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13732 }
13733 case ISD::LOAD: {
13734 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13735 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13736 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13737 ExtType == ISD::EXTLOAD) {
13738 auto MemVT = L->getMemoryVT();
13739 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13740 }
13741 return L->getMemoryVT().getSizeInBits() == 16;
13742 }
13743 default:
13744 return false;
13745 }
13746}
13747
13748 // Returns true if the mask selects consecutive bytes and the first byte
13749 // begins at an even (16-bit aligned) offset from byte 0.
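// Example (illustrative): a 16-bit slice mask of 0x0504 (bytes 4 then 5) is
// accepted, while 0x0201 (bytes 1 then 2) is rejected because the pair starts
// at an odd byte.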
13750static bool addresses16Bits(int Mask) {
13751 int Low8 = Mask & 0xff;
13752 int Hi8 = (Mask & 0xff00) >> 8;
13753
13754 assert(Low8 < 8 && Hi8 < 8);
13755 // Are the bytes contiguous in the order of increasing addresses.
13756 bool IsConsecutive = (Hi8 - Low8 == 1);
13757 // Is the first byte at a location that is aligned for 16-bit instructions?
13758 // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
13759 // In this case, we still need code to extract the 16 bit operand, so it
13760 // is better to use i8 v_perm
13761 bool Is16Aligned = !(Low8 % 2);
13762
13763 return IsConsecutive && Is16Aligned;
13764}
13765
13766// Do not lower into v_perm if the operands are actually 16 bit
13767// and the selected bits (based on PermMask) correspond with two
13768// easily addressable 16 bit operands.
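// Example (illustrative, assuming both operands are 16-bit values): with
// PermMask 0x05040100 each half of the result is a whole 16-bit operand, so
// this returns false and the or is left for 16-bit selection instead of v_perm.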
13769 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13770 SDValue &OtherOp) {
13771 int Low16 = PermMask & 0xffff;
13772 int Hi16 = (PermMask & 0xffff0000) >> 16;
13773
13774 auto TempOp = peekThroughBitcasts(Op);
13775 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13776
13777 auto OpIs16Bit =
13778 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13779 if (!OpIs16Bit)
13780 return true;
13781
13782 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13783 isExtendedFrom16Bits(TempOtherOp);
13784 if (!OtherOpIs16Bit)
13785 return true;
13786
13787 // Do we cleanly address both
13788 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13789}
13790
13791 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13792 unsigned DWordOffset) {
13793 SDValue Ret;
13794
13795 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13796 // ByteProvider must be at least 8 bits
13797 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13798
13799 if (TypeSize <= 32)
13800 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13801
13802 if (Src.getValueType().isVector()) {
13803 auto ScalarTySize = Src.getScalarValueSizeInBits();
13804 auto ScalarTy = Src.getValueType().getScalarType();
13805 if (ScalarTySize == 32) {
13806 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13807 DAG.getConstant(DWordOffset, SL, MVT::i32));
13808 }
13809 if (ScalarTySize > 32) {
13810 Ret = DAG.getNode(
13811 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13812 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13813 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13814 if (ShiftVal)
13815 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13816 DAG.getConstant(ShiftVal, SL, MVT::i32));
13817 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13818 }
13819
13820 assert(ScalarTySize < 32);
13821 auto NumElements = TypeSize / ScalarTySize;
13822 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13823 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13824 auto NumElementsIn32 = 32 / ScalarTySize;
13825 auto NumAvailElements = DWordOffset < Trunc32Elements
13826 ? NumElementsIn32
13827 : NumElements - NormalizedTrunc;
13828
13829 SmallVector<SDValue, 4> VecSrcs;
13830 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13831 NumAvailElements);
13832
13833 Ret = DAG.getBuildVector(
13834 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13835 VecSrcs);
13836 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13837 }
13838
13839 /// Scalar Type
13840 auto ShiftVal = 32 * DWordOffset;
13841 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13842 DAG.getConstant(ShiftVal, SL, MVT::i32));
13843 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13844}
13845
13846 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13847 SelectionDAG &DAG = DCI.DAG;
13848 [[maybe_unused]] EVT VT = N->getValueType(0);
13849 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13850
13851 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13852 assert(VT == MVT::i32);
13853 for (int i = 0; i < 4; i++) {
13854 // Find the ByteProvider that provides the ith byte of the result of OR
13855 std::optional<ByteProvider<SDValue>> P =
13856 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13857 // TODO support constantZero
13858 if (!P || P->isConstantZero())
13859 return SDValue();
13860
13861 PermNodes.push_back(*P);
13862 }
13863 if (PermNodes.size() != 4)
13864 return SDValue();
13865
13866 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13867 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13868 uint64_t PermMask = 0x00000000;
13869 for (size_t i = 0; i < PermNodes.size(); i++) {
13870 auto PermOp = PermNodes[i];
13871 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13872 // by sizeof(Src2) = 4
13873 int SrcByteAdjust = 4;
13874
13875 // If the Src uses a byte from a different DWORD, then it corresponds
13876 // with a different source
13877 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13878 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13879 if (SecondSrc)
13880 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13881 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13882 return SDValue();
13883
13884 // Set the index of the second distinct Src node
13885 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13886 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13887 SrcByteAdjust = 0;
13888 }
13889 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13890 assert(!DAG.getDataLayout().isBigEndian());
13891 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13892 }
13893 SDLoc DL(N);
13894 SDValue Op = *PermNodes[FirstSrc.first].Src;
13895 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13896 assert(Op.getValueSizeInBits() == 32);
13897
13898 // Check that we are not just extracting the bytes in order from an op
13899 if (!SecondSrc) {
13900 int Low16 = PermMask & 0xffff;
13901 int Hi16 = (PermMask & 0xffff0000) >> 16;
13902
13903 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13904 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13905
13906 // The perm op would really just produce Op. So combine into Op
13907 if (WellFormedLow && WellFormedHi)
13908 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13909 }
13910
13911 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13912
13913 if (SecondSrc) {
13914 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13915 assert(OtherOp.getValueSizeInBits() == 32);
13916 }
13917
13918 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13919
13920 assert(Op.getValueType().isByteSized() &&
13921 OtherOp.getValueType().isByteSized());
13922
13923 // If the ultimate src is less than 32 bits, then we will only be
13924 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13925 // CalculateByteProvider would not have returned Op as source if we
13926 // used a byte that is outside its ValueType. Thus, we are free to
13927 // ANY_EXTEND as the extended bits are don't-cares.
13928 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13929 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13930
13931 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13932 DAG.getConstant(PermMask, DL, MVT::i32));
13933 }
13934 return SDValue();
13935}
13936
13937SDValue SITargetLowering::performOrCombine(SDNode *N,
13938 DAGCombinerInfo &DCI) const {
13939 SelectionDAG &DAG = DCI.DAG;
13940 SDValue LHS = N->getOperand(0);
13941 SDValue RHS = N->getOperand(1);
13942
13943 EVT VT = N->getValueType(0);
13944 if (VT == MVT::i1) {
13945 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13946 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13947 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13948 SDValue Src = LHS.getOperand(0);
13949 if (Src != RHS.getOperand(0))
13950 return SDValue();
13951
13952 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13953 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13954 if (!CLHS || !CRHS)
13955 return SDValue();
13956
13957 // Only 10 bits are used.
13958 static const uint32_t MaxMask = 0x3ff;
13959
13960 uint32_t NewMask =
13961 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13962 SDLoc DL(N);
13963 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13964 DAG.getConstant(NewMask, DL, MVT::i32));
13965 }
13966
13967 return SDValue();
13968 }
13969
13970 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13971 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13972 LHS.getOpcode() == AMDGPUISD::PERM &&
13973 isa<ConstantSDNode>(LHS.getOperand(2))) {
13974 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13975 if (!Sel)
13976 return SDValue();
13977
13978 Sel |= LHS.getConstantOperandVal(2);
13979 SDLoc DL(N);
13980 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13981 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13982 }
13983
13984 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13985 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13986 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13987 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13988
13989 // If all the uses of an or need to extract the individual elements, do not
13990 // attempt to lower into v_perm
13991 auto usesCombinedOperand = [](SDNode *OrUse) {
13992 // If we have any non-vectorized use, then it is a candidate for v_perm
13993 if (OrUse->getOpcode() != ISD::BITCAST ||
13994 !OrUse->getValueType(0).isVector())
13995 return true;
13996
13997 // If we have any non-vectorized use, then it is a candidate for v_perm
13998 for (auto *VUser : OrUse->users()) {
13999 if (!VUser->getValueType(0).isVector())
14000 return true;
14001
14002 // If the use of a vector is a store, then combining via a v_perm
14003 // is beneficial.
14004 // TODO -- whitelist more uses
14005 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
14006 if (VUser->getOpcode() == VectorwiseOp)
14007 return true;
14008 }
14009 return false;
14010 };
14011
14012 if (!any_of(N->users(), usesCombinedOperand))
14013 return SDValue();
14014
14015 uint32_t LHSMask = getPermuteMask(LHS);
14016 uint32_t RHSMask = getPermuteMask(RHS);
14017
14018 if (LHSMask != ~0u && RHSMask != ~0u) {
14019 // Canonicalize the expression in an attempt to have fewer unique masks
14020 // and therefore fewer registers used to hold the masks.
14021 if (LHSMask > RHSMask) {
14022 std::swap(LHSMask, RHSMask);
14023 std::swap(LHS, RHS);
14024 }
14025
14026 // Mark with 0xc each lane that uses a byte of the source operand. In the
14027 // mask, zero lanes are 0x0c, 0xff lanes are 0xff, and real lanes are 0-3.
14028 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14029 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14030
14031 // Check if we need to combine values from two sources within a byte.
14032 if (!(LHSUsedLanes & RHSUsedLanes) &&
14033 // If we select high and lower word keep it for SDWA.
14034 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14035 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14036 // Kill zero bytes selected by the other mask. The zero selector is 0xc.
14037 LHSMask &= ~RHSUsedLanes;
14038 RHSMask &= ~LHSUsedLanes;
14039 // Add 4 to each active LHS lane
14040 LHSMask |= LHSUsedLanes & 0x04040404;
14041 // Combine masks
14042 uint32_t Sel = LHSMask | RHSMask;
14043 SDLoc DL(N);
14044
14045 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14046 RHS.getOperand(0),
14047 DAG.getConstant(Sel, DL, MVT::i32));
14048 }
14049 }
14050 if (LHSMask == ~0u || RHSMask == ~0u) {
14051 if (SDValue Perm = matchPERM(N, DCI))
14052 return Perm;
14053 }
14054 }
14055
14056 // Detect identity v2i32 OR and replace with identity source node.
14057 // Specifically an Or that has operands constructed from the same source node
14058 // via extract_vector_elt and build_vector. I.E.
14059 // v2i32 or(
14060 // v2i32 build_vector(
14061 // i32 extract_elt(%IdentitySrc, 0),
14062 // i32 0
14063 // ),
14064 // v2i32 build_vector(
14065 // i32 0,
14066 // i32 extract_elt(%IdentitySrc, 1)
14067 // ) )
14068 // =>
14069 // v2i32 %IdentitySrc
14070
14071 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14072 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14073
14074 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14075 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14076
14077 // Test for and normalise build vectors.
14078 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14079
14080 // Get the extract_vector_element operands.
14081 SDValue LEVE = LHS->getOperand(0);
14082 SDValue REVE = RHS->getOperand(1);
14083
14084 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14085 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14086 // Check that different elements from the same vector are
14087 // extracted.
14088 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14089 LEVE->getOperand(1) != REVE->getOperand(1)) {
14090 SDValue IdentitySrc = LEVE.getOperand(0);
14091 return IdentitySrc;
14092 }
14093 }
14094 }
14095 }
14096
14097 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14098 return SDValue();
14099
14100 // TODO: This could be a generic combine with a predicate for extracting the
14101 // high half of an integer being free.
14102
14103 // (or i64:x, (zero_extend i32:y)) ->
14104 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14105 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14106 RHS.getOpcode() != ISD::ZERO_EXTEND)
14107 std::swap(LHS, RHS);
14108
14109 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14110 SDValue ExtSrc = RHS.getOperand(0);
14111 EVT SrcVT = ExtSrc.getValueType();
14112 if (SrcVT == MVT::i32) {
14113 SDLoc SL(N);
14114 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14115 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14116
14117 DCI.AddToWorklist(LowOr.getNode());
14118 DCI.AddToWorklist(HiBits.getNode());
14119
14120 SDValue Vec =
14121 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14122 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14123 }
14124 }
14125
14126 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14127 if (CRHS) {
14128 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14129 N->getOperand(0), CRHS))
14130 return Split;
14131 }
14132
14133 return SDValue();
14134}
14135
14136SDValue SITargetLowering::performXorCombine(SDNode *N,
14137 DAGCombinerInfo &DCI) const {
14138 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14139 return RV;
14140
14141 SDValue LHS = N->getOperand(0);
14142 SDValue RHS = N->getOperand(1);
14143
14144 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14145 SelectionDAG &DAG = DCI.DAG;
14146
14147 EVT VT = N->getValueType(0);
14148 if (CRHS && VT == MVT::i64) {
14149 if (SDValue Split =
14150 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14151 return Split;
14152 }
14153
14154 // v2i32 (xor (vselect cc, x, y), K) ->
14155 //   (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14156 // replaced with source modifiers when the select is lowered to CNDMASK.
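// Illustrative instance: (xor (vselect cc, x, y), splat 0x80000000) becomes
// (vselect cc, (xor x, splat 0x80000000), (xor y, splat 0x80000000)), and each
// xor is later absorbed as an fneg source modifier on the CNDMASK inputs.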
14157 unsigned Opc = LHS.getOpcode();
14158 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14159 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14160 CRHS && CRHS->getAPIntValue().isSignMask()) {
14161 SDValue CC = LHS->getOperand(0);
14162 SDValue TRUE = LHS->getOperand(1);
14163 SDValue FALSE = LHS->getOperand(2);
14164 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14165 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14166 SDValue XSelect =
14167 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14168 return XSelect;
14169 }
14170
14171 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14172 // fneg-like xors into 64-bit select.
14173 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14174 // This looks like an fneg, try to fold as a source modifier.
14175 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14176 shouldFoldFNegIntoSrc(N, LHS)) {
14177 // xor (select c, a, b), 0x80000000 ->
14178 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14179 SDLoc DL(N);
14180 SDValue CastLHS =
14181 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14182 SDValue CastRHS =
14183 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14184 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14185 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14186 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14187 LHS->getOperand(0), FNegLHS, FNegRHS);
14188 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14189 }
14190 }
14191
14192 return SDValue();
14193}
14194
14195SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14196 DAGCombinerInfo &DCI) const {
14197 if (!Subtarget->has16BitInsts() ||
14198 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14199 return SDValue();
14200
14201 EVT VT = N->getValueType(0);
14202 if (VT != MVT::i32)
14203 return SDValue();
14204
14205 SDValue Src = N->getOperand(0);
14206 if (Src.getValueType() != MVT::i16)
14207 return SDValue();
14208
14209 return SDValue();
14210}
14211
14212SDValue
14213SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14214 DAGCombinerInfo &DCI) const {
14215 SDValue Src = N->getOperand(0);
14216 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14217
14218 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14219 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14220 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14221 VTSign->getVT() == MVT::i8) ||
14222 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14223 VTSign->getVT() == MVT::i16))) {
14224 assert(Subtarget->hasScalarSubwordLoads() &&
14225 "s_buffer_load_{u8, i8} are supported "
14226 "in GFX12 (or newer) architectures.");
14227 EVT VT = Src.getValueType();
14228 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14229 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14230 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14231 SDLoc DL(N);
14232 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14233 SDValue Ops[] = {
14234 Src.getOperand(0), // source register
14235 Src.getOperand(1), // offset
14236 Src.getOperand(2) // cachePolicy
14237 };
14238 auto *M = cast<MemSDNode>(Src);
14239 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14240 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14241 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14242 return LoadVal;
14243 }
14244 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14245 VTSign->getVT() == MVT::i8) ||
14246 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14247 VTSign->getVT() == MVT::i16)) &&
14248 Src.hasOneUse()) {
14249 auto *M = cast<MemSDNode>(Src);
14250 SDValue Ops[] = {Src.getOperand(0), // Chain
14251 Src.getOperand(1), // rsrc
14252 Src.getOperand(2), // vindex
14253 Src.getOperand(3), // voffset
14254 Src.getOperand(4), // soffset
14255 Src.getOperand(5), // offset
14256 Src.getOperand(6), Src.getOperand(7)};
14257 // replace with BUFFER_LOAD_BYTE/SHORT
14258 SDVTList ResList =
14259 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14260 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14262 ? AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
14263 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14264 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14265 return DCI.DAG.getMergeValues(
14266 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14267 }
14268 return SDValue();
14269}
14270
14271SDValue SITargetLowering::performClassCombine(SDNode *N,
14272 DAGCombinerInfo &DCI) const {
14273 SelectionDAG &DAG = DCI.DAG;
14274 SDValue Mask = N->getOperand(1);
14275
14276 // fp_class x, 0 -> false
14277 if (isNullConstant(Mask))
14278 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14279
14280 if (N->getOperand(0).isUndef())
14281 return DAG.getUNDEF(MVT::i1);
14282
14283 return SDValue();
14284}
14285
14286SDValue SITargetLowering::performRcpCombine(SDNode *N,
14287 DAGCombinerInfo &DCI) const {
14288 EVT VT = N->getValueType(0);
14289 SDValue N0 = N->getOperand(0);
14290
14291 if (N0.isUndef()) {
14292 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14293 SDLoc(N), VT);
14294 }
14295
14296 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14297 N0.getOpcode() == ISD::SINT_TO_FP)) {
14298 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14299 N->getFlags());
14300 }
14301
14302 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14303 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14304 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14305 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14306 N->getFlags());
14307 }
14308
14309 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14310 }
14311
14312 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14313 unsigned MaxDepth) const {
14314 unsigned Opcode = Op.getOpcode();
14315 if (Opcode == ISD::FCANONICALIZE)
14316 return true;
14317
14318 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14319 const auto &F = CFP->getValueAPF();
14320 if (F.isNaN() && F.isSignaling())
14321 return false;
14322 if (!F.isDenormal())
14323 return true;
14324
14325 DenormalMode Mode =
14326 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14327 return Mode == DenormalMode::getIEEE();
14328 }
14329
14330 // If source is a result of another standard FP operation it is already in
14331 // canonical form.
14332 if (MaxDepth == 0)
14333 return false;
14334
14335 switch (Opcode) {
14336 // These will flush denorms if required.
14337 case ISD::FADD:
14338 case ISD::FSUB:
14339 case ISD::FMUL:
14340 case ISD::FCEIL:
14341 case ISD::FFLOOR:
14342 case ISD::FMA:
14343 case ISD::FMAD:
14344 case ISD::FSQRT:
14345 case ISD::FDIV:
14346 case ISD::FREM:
14347 case ISD::FP_ROUND:
14348 case ISD::FP_EXTEND:
14349 case ISD::FP16_TO_FP:
14350 case ISD::FP_TO_FP16:
14351 case ISD::BF16_TO_FP:
14352 case ISD::FP_TO_BF16:
14353 case ISD::FLDEXP:
14354 case AMDGPUISD::FMUL_LEGACY:
14355 case AMDGPUISD::FMAD_FTZ:
14356 case AMDGPUISD::RCP:
14357 case AMDGPUISD::RSQ:
14358 case AMDGPUISD::RSQ_CLAMP:
14359 case AMDGPUISD::RCP_LEGACY:
14360 case AMDGPUISD::RCP_IFLAG:
14361 case AMDGPUISD::LOG:
14362 case AMDGPUISD::EXP:
14363 case AMDGPUISD::DIV_SCALE:
14364 case AMDGPUISD::DIV_FMAS:
14365 case AMDGPUISD::DIV_FIXUP:
14366 case AMDGPUISD::FRACT:
14367 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14368 case AMDGPUISD::CVT_F32_UBYTE0:
14369 case AMDGPUISD::CVT_F32_UBYTE1:
14370 case AMDGPUISD::CVT_F32_UBYTE2:
14371 case AMDGPUISD::CVT_F32_UBYTE3:
14372 case AMDGPUISD::FP_TO_FP16:
14373 case AMDGPUISD::SIN_HW:
14374 case AMDGPUISD::COS_HW:
14375 return true;
14376
14377 // It can/will be lowered or combined as a bit operation.
14378 // Need to check their input recursively to handle.
14379 case ISD::FNEG:
14380 case ISD::FABS:
14381 case ISD::FCOPYSIGN:
14382 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14383
14384 case ISD::AND:
14385 if (Op.getValueType() == MVT::i32) {
14386 // Be careful as we only know it is a bitcast floating point type. It
14387 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14388 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14389 // is valid to optimize for all types.
14390 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14391 if (RHS->getZExtValue() == 0xffff0000) {
14392 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14393 }
14394 }
14395 }
14396 break;
14397
14398 case ISD::FSIN:
14399 case ISD::FCOS:
14400 case ISD::FSINCOS:
14401 return Op.getValueType().getScalarType() != MVT::f16;
14402
14403 case ISD::FMINNUM:
14404 case ISD::FMAXNUM:
14405 case ISD::FMINNUM_IEEE:
14406 case ISD::FMAXNUM_IEEE:
14407 case ISD::FMINIMUM:
14408 case ISD::FMAXIMUM:
14409 case ISD::FMINIMUMNUM:
14410 case ISD::FMAXIMUMNUM:
14411 case AMDGPUISD::CLAMP:
14412 case AMDGPUISD::FMED3:
14413 case AMDGPUISD::FMAX3:
14414 case AMDGPUISD::FMIN3:
14415 case AMDGPUISD::FMAXIMUM3:
14416 case AMDGPUISD::FMINIMUM3: {
14417 // FIXME: Shouldn't treat the generic operations differently based on these.
14418 // However, we aren't really required to flush the result from
14419 // minnum/maxnum.
14420
14421 // snans will be quieted, so we only need to worry about denormals.
14422 if (Subtarget->supportsMinMaxDenormModes() ||
14423 // FIXME: denormalsEnabledForType is broken for dynamic
14424 denormalsEnabledForType(DAG, Op.getValueType()))
14425 return true;
14426
14427 // Flushing may be required.
14428 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms, so for
14429 // such targets we need to check their inputs recursively.
14430
14431 // FIXME: Does this apply with clamp? It's implemented with max.
14432 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14433 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14434 return false;
14435 }
14436
14437 return true;
14438 }
14439 case ISD::SELECT: {
14440 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14441 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14442 }
14443 case ISD::BUILD_VECTOR: {
14444 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14445 SDValue SrcOp = Op.getOperand(i);
14446 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14447 return false;
14448 }
14449
14450 return true;
14451 }
14454 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14455 }
14457 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14458 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14459 }
14460 case ISD::UNDEF:
14461 // Could be anything.
14462 return false;
14463
14464 case ISD::BITCAST:
14465 // TODO: This is incorrect as it loses track of the operand's type. We may
14466 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14467 // same bits that are canonicalized in one type need not be in the other.
14468 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14469 case ISD::TRUNCATE: {
14470 // Hack around the mess we make when legalizing extract_vector_elt
14471 if (Op.getValueType() == MVT::i16) {
14472 SDValue TruncSrc = Op.getOperand(0);
14473 if (TruncSrc.getValueType() == MVT::i32 &&
14474 TruncSrc.getOpcode() == ISD::BITCAST &&
14475 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14476 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14477 }
14478 }
14479 return false;
14480 }
14481 case ISD::INTRINSIC_WO_CHAIN: {
14482 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14483 // TODO: Handle more intrinsics
14484 switch (IntrinsicID) {
14485 case Intrinsic::amdgcn_cvt_pkrtz:
14486 case Intrinsic::amdgcn_cubeid:
14487 case Intrinsic::amdgcn_frexp_mant:
14488 case Intrinsic::amdgcn_fdot2:
14489 case Intrinsic::amdgcn_rcp:
14490 case Intrinsic::amdgcn_rsq:
14491 case Intrinsic::amdgcn_rsq_clamp:
14492 case Intrinsic::amdgcn_rcp_legacy:
14493 case Intrinsic::amdgcn_rsq_legacy:
14494 case Intrinsic::amdgcn_trig_preop:
14495 case Intrinsic::amdgcn_tanh:
14496 case Intrinsic::amdgcn_log:
14497 case Intrinsic::amdgcn_exp2:
14498 case Intrinsic::amdgcn_sqrt:
14499 return true;
14500 default:
14501 break;
14502 }
14503
14504 break;
14505 }
14506 default:
14507 break;
14508 }
14509
14510 // FIXME: denormalsEnabledForType is broken for dynamic
14511 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14512 DAG.isKnownNeverSNaN(Op);
14513}
14514
14515 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14516 unsigned MaxDepth) const {
14517 const MachineRegisterInfo &MRI = MF.getRegInfo();
14518 MachineInstr *MI = MRI.getVRegDef(Reg);
14519 unsigned Opcode = MI->getOpcode();
14520
14521 if (Opcode == AMDGPU::G_FCANONICALIZE)
14522 return true;
14523
14524 std::optional<FPValueAndVReg> FCR;
14525 // Constant splat (can be padded with undef) or scalar constant.
14526 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14527 if (FCR->Value.isSignaling())
14528 return false;
14529 if (!FCR->Value.isDenormal())
14530 return true;
14531
14532 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14533 return Mode == DenormalMode::getIEEE();
14534 }
14535
14536 if (MaxDepth == 0)
14537 return false;
14538
14539 switch (Opcode) {
14540 case AMDGPU::G_FADD:
14541 case AMDGPU::G_FSUB:
14542 case AMDGPU::G_FMUL:
14543 case AMDGPU::G_FCEIL:
14544 case AMDGPU::G_FFLOOR:
14545 case AMDGPU::G_FRINT:
14546 case AMDGPU::G_FNEARBYINT:
14547 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14548 case AMDGPU::G_INTRINSIC_TRUNC:
14549 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14550 case AMDGPU::G_FMA:
14551 case AMDGPU::G_FMAD:
14552 case AMDGPU::G_FSQRT:
14553 case AMDGPU::G_FDIV:
14554 case AMDGPU::G_FREM:
14555 case AMDGPU::G_FPOW:
14556 case AMDGPU::G_FPEXT:
14557 case AMDGPU::G_FLOG:
14558 case AMDGPU::G_FLOG2:
14559 case AMDGPU::G_FLOG10:
14560 case AMDGPU::G_FPTRUNC:
14561 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14562 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14563 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14564 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14565 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14566 return true;
14567 case AMDGPU::G_FNEG:
14568 case AMDGPU::G_FABS:
14569 case AMDGPU::G_FCOPYSIGN:
14570 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14571 case AMDGPU::G_FMINNUM:
14572 case AMDGPU::G_FMAXNUM:
14573 case AMDGPU::G_FMINNUM_IEEE:
14574 case AMDGPU::G_FMAXNUM_IEEE:
14575 case AMDGPU::G_FMINIMUM:
14576 case AMDGPU::G_FMAXIMUM:
14577 case AMDGPU::G_FMINIMUMNUM:
14578 case AMDGPU::G_FMAXIMUMNUM: {
14579 if (Subtarget->supportsMinMaxDenormModes() ||
14580 // FIXME: denormalsEnabledForType is broken for dynamic
14581 denormalsEnabledForType(MRI.getType(Reg), MF))
14582 return true;
14583
14584 [[fallthrough]];
14585 }
14586 case AMDGPU::G_BUILD_VECTOR:
14587 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14588 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14589 return false;
14590 return true;
14591 case AMDGPU::G_INTRINSIC:
14592 case AMDGPU::G_INTRINSIC_CONVERGENT:
14593 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14594 case Intrinsic::amdgcn_fmul_legacy:
14595 case Intrinsic::amdgcn_fmad_ftz:
14596 case Intrinsic::amdgcn_sqrt:
14597 case Intrinsic::amdgcn_fmed3:
14598 case Intrinsic::amdgcn_sin:
14599 case Intrinsic::amdgcn_cos:
14600 case Intrinsic::amdgcn_log:
14601 case Intrinsic::amdgcn_exp2:
14602 case Intrinsic::amdgcn_log_clamp:
14603 case Intrinsic::amdgcn_rcp:
14604 case Intrinsic::amdgcn_rcp_legacy:
14605 case Intrinsic::amdgcn_rsq:
14606 case Intrinsic::amdgcn_rsq_clamp:
14607 case Intrinsic::amdgcn_rsq_legacy:
14608 case Intrinsic::amdgcn_div_scale:
14609 case Intrinsic::amdgcn_div_fmas:
14610 case Intrinsic::amdgcn_div_fixup:
14611 case Intrinsic::amdgcn_fract:
14612 case Intrinsic::amdgcn_cvt_pkrtz:
14613 case Intrinsic::amdgcn_cubeid:
14614 case Intrinsic::amdgcn_cubema:
14615 case Intrinsic::amdgcn_cubesc:
14616 case Intrinsic::amdgcn_cubetc:
14617 case Intrinsic::amdgcn_frexp_mant:
14618 case Intrinsic::amdgcn_fdot2:
14619 case Intrinsic::amdgcn_trig_preop:
14620 case Intrinsic::amdgcn_tanh:
14621 return true;
14622 default:
14623 break;
14624 }
14625
14626 [[fallthrough]];
14627 default:
14628 return false;
14629 }
14630
14631 llvm_unreachable("invalid operation");
14632}
14633
14634// Constant fold canonicalize.
14635SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14636 const SDLoc &SL, EVT VT,
14637 const APFloat &C) const {
14638 // Flush denormals to 0 if not enabled.
14639 if (C.isDenormal()) {
14640 DenormalMode Mode =
14641 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14642 if (Mode == DenormalMode::getPreserveSign()) {
14643 return DAG.getConstantFP(
14644 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14645 }
14646
14647 if (Mode != DenormalMode::getIEEE())
14648 return SDValue();
14649 }
14650
14651 if (C.isNaN()) {
14652 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14653 if (C.isSignaling()) {
14654 // Quiet a signaling NaN.
14655 // FIXME: Is this supposed to preserve payload bits?
14656 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14657 }
14658
14659 // Make sure it is the canonical NaN bitpattern.
14660 //
14661 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14662 // immediate?
14663 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14664 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14665 }
14666
14667 // Already canonical.
14668 return DAG.getConstantFP(C, SL, VT);
14669}
14670
14671 static bool vectorEltWillFoldAway(SDValue Op) {
14672 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14673}
14674
14675SDValue
14676SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14677 DAGCombinerInfo &DCI) const {
14678 SelectionDAG &DAG = DCI.DAG;
14679 SDValue N0 = N->getOperand(0);
14680 EVT VT = N->getValueType(0);
14681
14682 // fcanonicalize undef -> qnan
14683 if (N0.isUndef()) {
14684 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
14685 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14686 }
14687
14688 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14689 EVT VT = N->getValueType(0);
14690 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14691 }
14692
14693 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14694 // (fcanonicalize k)
14695 //
14696 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14697
14698 // TODO: This could be better with wider vectors that will be split to v2f16,
14699 // and to consider uses since there aren't that many packed operations.
14700 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14701 isTypeLegal(MVT::v2f16)) {
14702 SDLoc SL(N);
14703 SDValue NewElts[2];
14704 SDValue Lo = N0.getOperand(0);
14705 SDValue Hi = N0.getOperand(1);
14706 EVT EltVT = Lo.getValueType();
14707
14709 for (unsigned I = 0; I != 2; ++I) {
14710 SDValue Op = N0.getOperand(I);
14711 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14712 NewElts[I] =
14713 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14714 } else if (Op.isUndef()) {
14715 // Handled below based on what the other operand is.
14716 NewElts[I] = Op;
14717 } else {
14718 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14719 }
14720 }
14721
14722 // If one half is undef, and one is constant, prefer a splat vector rather
14723 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14724 // cheaper to use and may be free with a packed operation.
14725 if (NewElts[0].isUndef()) {
14727 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14728 ? NewElts[1]
14729 : DAG.getConstantFP(0.0f, SL, EltVT);
14730 }
14731
14732 if (NewElts[1].isUndef()) {
14733 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14734 ? NewElts[0]
14735 : DAG.getConstantFP(0.0f, SL, EltVT);
14736 }
14737
14738 return DAG.getBuildVector(VT, SL, NewElts);
14739 }
14740 }
14741
14742 return SDValue();
14743}
14744
14745static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14746 switch (Opc) {
14747 case ISD::FMAXNUM:
14748 case ISD::FMAXNUM_IEEE:
14749 case ISD::FMAXIMUMNUM:
14750 return AMDGPUISD::FMAX3;
14751 case ISD::FMAXIMUM:
14752 return AMDGPUISD::FMAXIMUM3;
14753 case ISD::SMAX:
14754 return AMDGPUISD::SMAX3;
14755 case ISD::UMAX:
14756 return AMDGPUISD::UMAX3;
14757 case ISD::FMINNUM:
14758 case ISD::FMINNUM_IEEE:
14759 case ISD::FMINIMUMNUM:
14760 return AMDGPUISD::FMIN3;
14761 case ISD::FMINIMUM:
14762 return AMDGPUISD::FMINIMUM3;
14763 case ISD::SMIN:
14764 return AMDGPUISD::SMIN3;
14765 case ISD::UMIN:
14766 return AMDGPUISD::UMIN3;
14767 default:
14768 llvm_unreachable("Not a min/max opcode");
14769 }
14770}
14771
14772SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14773 const SDLoc &SL, SDValue Src,
14774 SDValue MinVal,
14775 SDValue MaxVal,
14776 bool Signed) const {
14777
14778 // med3 comes from
14779 // min(max(x, K0), K1), K0 < K1
14780 // max(min(x, K0), K1), K1 < K0
14781 //
14782 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14783 // min/max op.
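 // Illustrative example: smin(smax(x, -8), 7) clamps x to [-8, 7], which is
 // the middle value of {x, -8, 7}, i.e. exactly smed3(x, -8, 7).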
14784 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14785 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14786
14787 if (!MinK || !MaxK)
14788 return SDValue();
14789
14790 if (Signed) {
14791 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14792 return SDValue();
14793 } else {
14794 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14795 return SDValue();
14796 }
14797
14798 EVT VT = MinK->getValueType(0);
14799 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14800 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14801 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14802
14803 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14804 // not available, but this is unlikely to be profitable as constants
14805 // will often need to be materialized & extended, especially on
14806 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14807 return SDValue();
14808}
14810 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14811 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14809
14812 return C;
14814 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14813
14815 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14816 return C;
14817 }
14818
14819 return nullptr;
14820}
14821
14822SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14823 const SDLoc &SL, SDValue Op0,
14824 SDValue Op1) const {
14825 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14826 if (!K1)
14827 return SDValue();
14828
14829 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14830 if (!K0)
14831 return SDValue();
14832
14833 // Ordered >= (although NaN inputs should have folded away by now).
14834 if (K0->getValueAPF() > K1->getValueAPF())
14835 return SDValue();
14836
14837 // med3 with a nan input acts like
14838 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14839 //
14840 // So with a signaling nan input, the result depends on whether the IEEE
14841 // mode bit is enabled or not.
14842 // ieee=1
14843 // s0 snan: yields s2
14844 // s1 snan: yields s2
14845 // s2 snan: qnan
14846
14847 // s0 qnan: min(s1, s2)
14848 // s1 qnan: min(s0, s2)
14849 // s2 qnan: min(s0, s1)
14850
14851 // ieee=0
14852 // s0 snan: min(s1, s2)
14853 // s1 snan: min(s0, s2)
14854 // s2 snan: qnan
14855
14856 // s0 qnan: min(s1, s2)
14857 // s1 qnan: min(s0, s2)
14858 // s2 qnan: min(s0, s1)
14859 const MachineFunction &MF = DAG.getMachineFunction();
14860 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14861
14862 // TODO: Check whether the IEEE bit is enabled. We can form fmed3 with
14863 // IEEE=0 regardless of whether the input is a signaling nan if op0 is
14864 // fmaximum or fmaximumnum. We can only form it from fmaxnum_ieee if IEEE=1.
14865 EVT VT = Op0.getValueType();
14866 if (Info->getMode().DX10Clamp) {
14867 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14868 // hardware fmed3 behavior converting to a min.
14869 // FIXME: Should this be allowing -0.0?
14870 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14871 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14872 }
14873
14874 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14875 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14876 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14877 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14878 // then give the other result, which is different from med3 with a NaN
14879 // input.
14880 SDValue Var = Op0.getOperand(0);
14881 if (!DAG.isKnownNeverSNaN(Var))
14882 return SDValue();
14883
14884 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14885
14886 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14887 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14888 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14889 SDValue(K0, 0), SDValue(K1, 0));
14890 }
14891 }
14892
14893 return SDValue();
14894}
14895
14896/// \return true if the subtarget supports minimum3 and maximum3 with the given
14897/// base min/max opcode \p Opc for type \p VT.
14898static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14899 EVT VT) {
14900 switch (Opc) {
14901 case ISD::FMINNUM:
14902 case ISD::FMAXNUM:
14903 case ISD::FMINNUM_IEEE:
14904 case ISD::FMAXNUM_IEEE:
14905 case ISD::FMINIMUMNUM:
14906 case ISD::FMAXIMUMNUM:
14909 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14910 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14911 case ISD::FMINIMUM:
14912 case ISD::FMAXIMUM:
14913 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14914 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14915 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14916 case ISD::SMAX:
14917 case ISD::SMIN:
14918 case ISD::UMAX:
14919 case ISD::UMIN:
14920 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14921 default:
14922 return false;
14923 }
14924
14925 llvm_unreachable("not a min/max opcode");
14926}
14927
14928SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14929 DAGCombinerInfo &DCI) const {
14930 SelectionDAG &DAG = DCI.DAG;
14931
14932 EVT VT = N->getValueType(0);
14933 unsigned Opc = N->getOpcode();
14934 SDValue Op0 = N->getOperand(0);
14935 SDValue Op1 = N->getOperand(1);
14936
14937 // Only do this if the inner op has one use, since otherwise this just
14938 // increases register pressure for no benefit.
14939
14940 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14941 // max(max(a, b), c) -> max3(a, b, c)
14942 // min(min(a, b), c) -> min3(a, b, c)
14943 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14944 SDLoc DL(N);
14945 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14946 Op0.getOperand(0), Op0.getOperand(1), Op1);
14947 }
14948
14949 // Try commuted.
14950 // max(a, max(b, c)) -> max3(a, b, c)
14951 // min(a, min(b, c)) -> min3(a, b, c)
14952 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14953 SDLoc DL(N);
14954 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14955 Op0, Op1.getOperand(0), Op1.getOperand(1));
14956 }
14957 }
14958
14959 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14960 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14961 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14962 if (SDValue Med3 = performIntMed3ImmCombine(
14963 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14964 return Med3;
14965 }
14966 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14967 if (SDValue Med3 = performIntMed3ImmCombine(
14968 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14969 return Med3;
14970 }
14971
14972 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14973 if (SDValue Med3 = performIntMed3ImmCombine(
14974 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14975 return Med3;
14976 }
14977 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14978 if (SDValue Med3 = performIntMed3ImmCombine(
14979 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14980 return Med3;
14981 }
14982
14983 // if !is_snan(x):
14984 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14985 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14986 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14987 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
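 // For instance (illustrative), fminnum(fmaxnum(x, 2.0), 4.0) with x known not
 // to be a signaling nan can become fmed3(x, 2.0, 4.0).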
14988 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14989 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14990 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14991 (Opc == AMDGPUISD::FMIN_LEGACY &&
14992 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14993 (VT == MVT::f32 || VT == MVT::f64 ||
14994 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14995 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14996 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14997 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14998 Op0.hasOneUse()) {
14999 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15000 return Res;
15001 }
15002
15003 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
15004 // for some types, but at a higher cost since it's implemented with a 3
15005 // operand form.
15006 const SDNodeFlags Flags = N->getFlags();
15007 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
15008 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15009 unsigned NewOpc =
15010 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15011 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15012 }
15013
15014 return SDValue();
15015}
15016
15017 static bool isClampZeroToOne(SDValue A, SDValue B) {
15018 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15019 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15020 // FIXME: Should this be allowing -0.0?
15021 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15022 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15023 }
15024 }
15025
15026 return false;
15027}
15028
15029// FIXME: Should only worry about snans for version with chain.
15030SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15031 DAGCombinerInfo &DCI) const {
15032 EVT VT = N->getValueType(0);
15033 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15034 // NaNs. With a NaN input, the order of the operands may change the result.
15035
15036 SelectionDAG &DAG = DCI.DAG;
15037 SDLoc SL(N);
15038
15039 SDValue Src0 = N->getOperand(0);
15040 SDValue Src1 = N->getOperand(1);
15041 SDValue Src2 = N->getOperand(2);
15042
15043 if (isClampZeroToOne(Src0, Src1)) {
15044 // const_a, const_b, x -> clamp is safe in all cases including signaling
15045 // nans.
15046 // FIXME: Should this be allowing -0.0?
15047 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15048 }
15049
15050 const MachineFunction &MF = DAG.getMachineFunction();
15051 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15052
15053 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15054 // handling no dx10-clamp?
15055 if (Info->getMode().DX10Clamp) {
15056 // If NaN is clamped to 0, we are free to reorder the inputs.
15057
15058 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15059 std::swap(Src0, Src1);
15060
15061 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15062 std::swap(Src1, Src2);
15063
15064 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15065 std::swap(Src0, Src1);
15066
15067 if (isClampZeroToOne(Src1, Src2))
15068 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15069 }
15070
15071 return SDValue();
15072}
15073
15074SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15075 DAGCombinerInfo &DCI) const {
15076 SDValue Src0 = N->getOperand(0);
15077 SDValue Src1 = N->getOperand(1);
15078 if (Src0.isUndef() && Src1.isUndef())
15079 return DCI.DAG.getUNDEF(N->getValueType(0));
15080 return SDValue();
15081}
15082
15083// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15084// expanded into a set of cmp/select instructions.
15085 static bool shouldExpandVectorDynExt(unsigned EltSize,
15086 unsigned NumElem,
15087 bool IsDivergentIdx,
15088 const GCNSubtarget *Subtarget) {
15089 if (UseDivergentRegisterIndexing)
15090 return false;
15091
15092 unsigned VecSize = EltSize * NumElem;
15093
15094 // Sub-dword vectors of 2 dwords or less have a better implementation.
15095 if (VecSize <= 64 && EltSize < 32)
15096 return false;
15097
15098 // Always expand the remaining sub-dword cases, otherwise they will be
15099 // lowered via memory.
15100 if (EltSize < 32)
15101 return true;
15102
15103 // Always do this if var-idx is divergent, otherwise it will become a loop.
15104 if (IsDivergentIdx)
15105 return true;
15106
15107 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15108 unsigned NumInsts = NumElem /* Number of compares */ +
15109 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
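 // Illustrative counts: a v8i32 access costs 8 + 1 * 8 = 16 instructions under
 // this estimate, while a v8i64 access costs 8 + 2 * 8 = 24.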
15110
15111 // On some architectures (GFX9) movrel is not available and it's better
15112 // to expand.
15113 if (Subtarget->useVGPRIndexMode())
15114 return NumInsts <= 16;
15115
15116 // If movrel is available, use it instead of expanding for vector of 8
15117 // elements.
15118 if (Subtarget->hasMovrel())
15119 return NumInsts <= 15;
15120
15121 return true;
15122}
15123
15124 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15125 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15126 if (isa<ConstantSDNode>(Idx))
15127 return false;
15128
15129 SDValue Vec = N->getOperand(0);
15130 EVT VecVT = Vec.getValueType();
15131 EVT EltVT = VecVT.getVectorElementType();
15132 unsigned EltSize = EltVT.getSizeInBits();
15133 unsigned NumElem = VecVT.getVectorNumElements();
15134
15135 return ::shouldExpandVectorDynExt(
15136 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15137}
15138
15139SDValue
15140SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15141 DAGCombinerInfo &DCI) const {
15142 SDValue Vec = N->getOperand(0);
15143 SelectionDAG &DAG = DCI.DAG;
15144
15145 EVT VecVT = Vec.getValueType();
15146 EVT VecEltVT = VecVT.getVectorElementType();
15147 EVT ResVT = N->getValueType(0);
15148
15149 unsigned VecSize = VecVT.getSizeInBits();
15150 unsigned VecEltSize = VecEltVT.getSizeInBits();
15151
15152 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15153 allUsesHaveSourceMods(N)) {
15154 SDLoc SL(N);
15155 SDValue Idx = N->getOperand(1);
15156 SDValue Elt =
15157 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15158 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15159 }
15160
15161 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15162 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15163 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15164 // depending on the shift operand. See e.g. performSraCombine().
15165 // This combine ensures that the optimisation is compatible with v2i32
15166 // legalised AND.
15167 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15168 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15169
15170 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
15171 if (!C || C->getZExtValue() != 0x1f)
15172 return SDValue();
15173
15174 SDLoc SL(N);
15175 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15176 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15177 Vec->getOperand(0), N->getOperand(1));
15178 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15179 DAG.ReplaceAllUsesWith(N, A.getNode());
15180 }
15181
15182 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15183 // =>
15184 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15185 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15186 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15187 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15188 SDLoc SL(N);
15189 SDValue Idx = N->getOperand(1);
15190 unsigned Opc = Vec.getOpcode();
15191
15192 switch (Opc) {
15193 default:
15194 break;
15195 // TODO: Support other binary operations.
15196 case ISD::FADD:
15197 case ISD::FSUB:
15198 case ISD::FMUL:
15199 case ISD::ADD:
15200 case ISD::UMIN:
15201 case ISD::UMAX:
15202 case ISD::SMIN:
15203 case ISD::SMAX:
15204 case ISD::FMAXNUM:
15205 case ISD::FMINNUM:
15206 case ISD::FMAXNUM_IEEE:
15207 case ISD::FMINNUM_IEEE:
15208 case ISD::FMAXIMUM:
15209 case ISD::FMINIMUM: {
15210 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15211 Vec.getOperand(0), Idx);
15212 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15213 Vec.getOperand(1), Idx);
15214
15215 DCI.AddToWorklist(Elt0.getNode());
15216 DCI.AddToWorklist(Elt1.getNode());
15217 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15218 }
15219 }
15220 }
15221
15222 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15223 if (shouldExpandVectorDynExt(N)) {
15224 SDLoc SL(N);
15225 SDValue Idx = N->getOperand(1);
15226 SDValue V;
15227 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15228 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15229 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15230 if (I == 0)
15231 V = Elt;
15232 else
15233 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15234 }
15235 return V;
15236 }
15237
15238 if (!DCI.isBeforeLegalize())
15239 return SDValue();
15240
15241 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15242 // elements. This exposes more load reduction opportunities by replacing
15243 // multiple small extract_vector_elements with a single 32-bit extract.
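 // Illustrative example: extracting element 5 of a loaded v8i8 becomes an
 // extract of i32 element 1 (bit index 40), followed by a shift right by 8 and
 // a truncate back to i8.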
15244 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15245 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15246 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15247 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15248
15249 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15250 unsigned EltIdx = BitIndex / 32;
15251 unsigned LeftoverBitIdx = BitIndex % 32;
15252 SDLoc SL(N);
15253
15254 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15255 DCI.AddToWorklist(Cast.getNode());
15256
15257 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15258 DAG.getConstant(EltIdx, SL, MVT::i32));
15259 DCI.AddToWorklist(Elt.getNode());
15260 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15261 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15262 DCI.AddToWorklist(Srl.getNode());
15263
15264 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15265 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15266 DCI.AddToWorklist(Trunc.getNode());
15267
15268 if (VecEltVT == ResVT) {
15269 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15270 }
15271
15272 assert(ResVT.isScalarInteger());
15273 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15274 }
15275
15276 return SDValue();
15277}
15278
15279SDValue
15280SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15281 DAGCombinerInfo &DCI) const {
15282 SDValue Vec = N->getOperand(0);
15283 SDValue Idx = N->getOperand(2);
15284 EVT VecVT = Vec.getValueType();
15285 EVT EltVT = VecVT.getVectorElementType();
15286
15287 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15288 // => BUILD_VECTOR n x select (e, const-idx)
15289 if (!shouldExpandVectorDynExt(N))
15290 return SDValue();
15291
15292 SelectionDAG &DAG = DCI.DAG;
15293 SDLoc SL(N);
15294 SDValue Ins = N->getOperand(1);
15295 EVT IdxVT = Idx.getValueType();
15296
15298 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15299 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15300 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15301 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15302 Ops.push_back(V);
15303 }
15304
15305 return DAG.getBuildVector(VecVT, SL, Ops);
15306}
15307
15308/// Return the source of an fp_extend from f16 to f32, or a converted FP
15309/// constant.
15310 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15311 if (Src.getOpcode() == ISD::FP_EXTEND &&
15312 Src.getOperand(0).getValueType() == MVT::f16) {
15313 return Src.getOperand(0);
15314 }
15315
15316 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15317 APFloat Val = CFP->getValueAPF();
15318 bool LosesInfo = true;
15319 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15320 if (!LosesInfo)
15321 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15322 }
15323
15324 return SDValue();
15325}
15326
15327SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15328 DAGCombinerInfo &DCI) const {
15329 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15330 "combine only useful on gfx8");
15331
15332 SDValue TruncSrc = N->getOperand(0);
15333 EVT VT = N->getValueType(0);
15334 if (VT != MVT::f16)
15335 return SDValue();
15336
15337 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15338 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15339 return SDValue();
15340
15341 SelectionDAG &DAG = DCI.DAG;
15342 SDLoc SL(N);
15343
15344 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15345 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15346 // casting back.
15347
15348 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15349 // fmin(fmax(a, b), fmax(fmin(a, b), c))
15350 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15351 if (!A)
15352 return SDValue();
15353
15354 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15355 if (!B)
15356 return SDValue();
15357
15358 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15359 if (!C)
15360 return SDValue();
15361
15362 // This changes signaling nan behavior. If an input is a signaling nan, it
15363 // would have been quieted by the fpext originally. We don't care because
15364 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15365 // we would be worse off than just doing the promotion.
15366 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15367 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15368 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15369 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15370}
15371
15372unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15373 const SDNode *N0,
15374 const SDNode *N1) const {
15375 EVT VT = N0->getValueType(0);
15376
15377 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15378 // support denormals ever.
15379 if (((VT == MVT::f32 &&
15380 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
15381 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15382 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
15383 isOperationLegal(ISD::FMAD, VT))
15384 return ISD::FMAD;
15385
15386 const TargetOptions &Options = DAG.getTarget().Options;
15387 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15388 (N0->getFlags().hasAllowContract() &&
15389 N1->getFlags().hasAllowContract())) &&
15390 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15391 return ISD::FMA;
15392 }
15393
15394 return 0;
15395}
15396
15397// For a reassociatable opcode perform:
15398// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
15399SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15400 SelectionDAG &DAG) const {
15401 EVT VT = N->getValueType(0);
15402 if (VT != MVT::i32 && VT != MVT::i64)
15403 return SDValue();
15404
15405 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15406 return SDValue();
15407
15408 unsigned Opc = N->getOpcode();
15409 SDValue Op0 = N->getOperand(0);
15410 SDValue Op1 = N->getOperand(1);
15411
15412 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15413 return SDValue();
15414
15415 if (Op0->isDivergent())
15416 std::swap(Op0, Op1);
15417
15418 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15419 return SDValue();
15420
15421 SDValue Op2 = Op1.getOperand(1);
15422 Op1 = Op1.getOperand(0);
15423 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15424 return SDValue();
15425
15426 if (Op1->isDivergent())
15427 std::swap(Op1, Op2);
15428
15429 SDLoc SL(N);
15430 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15431 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15432}
15433
15434static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15435 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15436 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15437 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15438 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15439 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15440}
15441
15442// Fold
15443// y = lshr i64 x, 32
15444// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15445// with Const.hi == -1
15446// To
15447 // res = mad_u64_u32 y.lo, Const.lo, x.lo
15448 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15449 SDValue MulLHS, SDValue MulRHS,
15450 SDValue AddRHS) {
15451 if (MulRHS.getOpcode() == ISD::SRL)
15452 std::swap(MulLHS, MulRHS);
15453
15454 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15455 return SDValue();
15456
15457 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15458 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15459 MulLHS.getOperand(0) != AddRHS)
15460 return SDValue();
15461
15463 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15464 return SDValue();
15465
15466 SDValue ConstMul =
15467 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15468 return getMad64_32(DAG, SL, MVT::i64,
15469 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15470 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15471}
15472
15473// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15474// multiplies, if any.
15475//
15476// Full 64-bit multiplies that feed into an addition are lowered here instead
15477// of using the generic expansion. The generic expansion ends up with
15478// a tree of ADD nodes that prevents us from using the "add" part of the
15479// MAD instruction. The expansion produced here results in a chain of ADDs
15480// instead of a tree.
15481SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15482 DAGCombinerInfo &DCI) const {
15483 assert(N->isAnyAdd());
15484
15485 SelectionDAG &DAG = DCI.DAG;
15486 EVT VT = N->getValueType(0);
15487 SDLoc SL(N);
15488 SDValue LHS = N->getOperand(0);
15489 SDValue RHS = N->getOperand(1);
15490
15491 if (VT.isVector())
15492 return SDValue();
15493
15494 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15495 // result in scalar registers for uniform values.
15496 if (!N->isDivergent() && Subtarget->hasSMulHi())
15497 return SDValue();
15498
15499 unsigned NumBits = VT.getScalarSizeInBits();
15500 if (NumBits <= 32 || NumBits > 64)
15501 return SDValue();
15502
15503 if (LHS.getOpcode() != ISD::MUL) {
15504 assert(RHS.getOpcode() == ISD::MUL);
15505 std::swap(LHS, RHS);
15506 }
15507
15508 // Avoid the fold if it would unduly increase the number of multiplies due to
15509 // multiple uses, except on hardware with full-rate multiply-add (which is
15510 // part of full-rate 64-bit ops).
15511 if (!Subtarget->hasFullRate64Ops()) {
15512 unsigned NumUsers = 0;
15513 for (SDNode *User : LHS->users()) {
15514 // There is a use that does not feed into addition, so the multiply can't
15515 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15516 if (!User->isAnyAdd())
15517 return SDValue();
15518
15519 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15520 // MUL + 3xADD + 3xADDC over 3xMAD.
15521 ++NumUsers;
15522 if (NumUsers >= 3)
15523 return SDValue();
15524 }
15525 }
15526
15527 SDValue MulLHS = LHS.getOperand(0);
15528 SDValue MulRHS = LHS.getOperand(1);
15529 SDValue AddRHS = RHS;
15530
15531 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15532 return FoldedMAD;
15533
15534 // Always check whether operands are small unsigned values, since that
15535 // knowledge is useful in more cases. Check for small signed values only if
15536 // doing so can unlock a shorter code sequence.
15537 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15538 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15539
15540 bool MulSignedLo = false;
15541 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15542 MulSignedLo =
15543 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15544 }
15545
15546 // The operands and final result all have the same number of bits. If
15547 // operands need to be extended, they can be extended with garbage. The
15548 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15549 // truncated away in the end.
15550 if (VT != MVT::i64) {
15551 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15552 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15553 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15554 }
15555
15556 // The basic code generated is conceptually straightforward. Pseudo code:
15557 //
15558 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15559 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15560 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15561 //
15562 // The second and third lines are optional, depending on whether the factors
15563 // are {sign,zero}-extended or not.
15564 //
15565 // The actual DAG is noisier than the pseudo code, but only due to
15566 // instructions that disassemble values into low and high parts, and
15567 // assemble the final result.
15568 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15569
15570 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15571 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15572 SDValue Accum =
15573 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15574
15575 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15576 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15577
15578 if (!MulLHSUnsigned32) {
15579 auto MulLHSHi =
15580 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15581 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15582 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15583 }
15584
15585 if (!MulRHSUnsigned32) {
15586 auto MulRHSHi =
15587 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15588 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15589 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15590 }
15591
15592 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15593 Accum = DAG.getBitcast(MVT::i64, Accum);
15594 }
15595
15596 if (VT != MVT::i64)
15597 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15598 return Accum;
15599}
15600
15601SDValue
15602SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15603 DAGCombinerInfo &DCI) const {
15604 SDValue RHS = N->getOperand(1);
15605 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15606 if (!CRHS)
15607 return SDValue();
15608
15609 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15610 // common.
15611 uint64_t Val = CRHS->getZExtValue();
15612 if (countr_zero(Val) >= 32) {
15613 SelectionDAG &DAG = DCI.DAG;
15614 SDLoc SL(N);
15615 SDValue LHS = N->getOperand(0);
15616
15617 // Avoid carry machinery if we know the low half of the add does not
15618 // contribute to the final result.
15619 //
15620 // add i64:x, K if computeTrailingZeros(K) >= 32
15621 // => build_pair (add x.hi, K.hi), x.lo
15622
15623 // Breaking the 64-bit add here with this strange constant is unlikely
15624 // to interfere with addressing mode patterns.
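 // Worked example (illustrative): add i64 %x, 0x1234567800000000 becomes
 // build_pair (add %x.hi, 0x12345678), %x.lo; no carry chain is needed because
 // the low 32 bits of the constant are all zero.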
15625
15626 SDValue Hi = getHiHalf64(LHS, DAG);
15627 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15628 unsigned Opcode = N->getOpcode();
15629 if (Opcode == ISD::PTRADD)
15630 Opcode = ISD::ADD;
15631 SDValue AddHi =
15632 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15633
15634 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15635 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15636 }
15637
15638 return SDValue();
15639}
15640
15641 // Collect the ultimate src of each of the mul node's operands, and confirm
15642 // each operand is no wider than 8 bits.
15643static std::optional<ByteProvider<SDValue>>
15644handleMulOperand(const SDValue &MulOperand) {
15645 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15646 if (!Byte0 || Byte0->isConstantZero()) {
15647 return std::nullopt;
15648 }
15649 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15650 if (Byte1 && !Byte1->isConstantZero()) {
15651 return std::nullopt;
15652 }
15653 return Byte0;
15654}
15655
15656static unsigned addPermMasks(unsigned First, unsigned Second) {
15657 unsigned FirstCs = First & 0x0c0c0c0c;
15658 unsigned SecondCs = Second & 0x0c0c0c0c;
15659 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15660 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15661
15662 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15663 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15664 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15665 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15666
15667 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15668}
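// Illustrative example: addPermMasks(0x0c0c0c04, 0x050c0c0c) == 0x050c0c04.
// Byte selectors from the two masks are merged, and a byte remains 0x0c
// (select constant zero) only where both masks have 0x0c.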
15669
15670struct DotSrc {
15671 SDValue SrcOp;
15672 int64_t PermMask;
15673 int64_t DWordOffset;
15674};
15675
15676 static void placeSources(ByteProvider<SDValue> &Src0,
15677 ByteProvider<SDValue> &Src1,
15678 SmallVectorImpl<DotSrc> &Src0s,
15679 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15680
15681 assert(Src0.Src.has_value() && Src1.Src.has_value());
15682 // Src0s and Src1s are empty, just place arbitrarily.
15683 if (Step == 0) {
15684 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15685 Src0.SrcOffset / 4});
15686 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15687 Src1.SrcOffset / 4});
15688 return;
15689 }
15690
15691 for (int BPI = 0; BPI < 2; BPI++) {
15692 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15693 if (BPI == 1) {
15694 BPP = {Src1, Src0};
15695 }
15696 unsigned ZeroMask = 0x0c0c0c0c;
15697 unsigned FMask = 0xFF << (8 * (3 - Step));
15698
15699 unsigned FirstMask =
15700 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15701 unsigned SecondMask =
15702 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15703 // Attempt to find a Src vector which contains our SDValue; if found, add our
15704 // perm mask to the existing one. If we are unable to find a match for the
15705 // first SDValue, attempt to find a match for the second.
15706 int FirstGroup = -1;
15707 for (int I = 0; I < 2; I++) {
15708 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15709 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15710 return IterElt.SrcOp == *BPP.first.Src &&
15711 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15712 };
15713
15714 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15715 if (Match != Srcs.end()) {
15716 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15717 FirstGroup = I;
15718 break;
15719 }
15720 }
15721 if (FirstGroup != -1) {
15722 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15723 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15724 return IterElt.SrcOp == *BPP.second.Src &&
15725 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15726 };
15727 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15728 if (Match != Srcs.end()) {
15729 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15730 } else
15731 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15732 return;
15733 }
15734 }
15735
15736 // If we have made it here, then we could not find a match in Src0s or Src1s
15737 // for either Src0 or Src1, so just place them arbitrarily.
15738
15739 unsigned ZeroMask = 0x0c0c0c0c;
15740 unsigned FMask = 0xFF << (8 * (3 - Step));
15741
15742 Src0s.push_back(
15743 {*Src0.Src,
15744 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15745 Src0.SrcOffset / 4});
15746 Src1s.push_back(
15747 {*Src1.Src,
15748 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15749 Src1.SrcOffset / 4});
15750}
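// Illustrative example: with Step == 1 and SrcOffset == 6, the entry gets
// DWordOffset 1 and mask 0x0c020c0c, i.e. byte selector 2 (6 % 4) placed in
// mask byte 2 (3 - Step), with every other byte selecting constant zero.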
15751
15753 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15754 bool IsAny) {
15755
15756 // If we just have one source, just permute it accordingly.
15757 if (Srcs.size() == 1) {
15758 auto *Elt = Srcs.begin();
15759 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15760
15761 // v_perm will produce the original value
15762 if (Elt->PermMask == 0x3020100)
15763 return EltOp;
15764
15765 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15766 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15767 }
15768
15769 auto *FirstElt = Srcs.begin();
15770 auto *SecondElt = std::next(FirstElt);
15771
15772 SmallVector<SDValue, 3> Perms;
15773
15774 // If we have multiple sources in the chain, combine them via perms (using
15775 // calculated perm mask) and Ors.
15776 while (true) {
15777 auto FirstMask = FirstElt->PermMask;
15778 auto SecondMask = SecondElt->PermMask;
15779
15780 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15781 unsigned FirstPlusFour = FirstMask | 0x04040404;
15782 // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
15783 // original 0x0C.
15784 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15785
15786 auto PermMask = addPermMasks(FirstMask, SecondMask);
15787 auto FirstVal =
15788 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15789 auto SecondVal =
15790 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15791
15792 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15793 SecondVal,
15794 DAG.getConstant(PermMask, SL, MVT::i32)));
15795
15796 FirstElt = std::next(SecondElt);
15797 if (FirstElt == Srcs.end())
15798 break;
15799
15800 SecondElt = std::next(FirstElt);
15801 // If we only have a FirstElt, then just combine that into the cumulative
15802 // source node.
15803 if (SecondElt == Srcs.end()) {
15804 auto EltOp =
15805 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15806
15807 Perms.push_back(
15808 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15809 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15810 break;
15811 }
15812 }
15813
15814 assert(Perms.size() == 1 || Perms.size() == 2);
15815 return Perms.size() == 2
15816 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15817 : Perms[0];
15818}
15819
15820static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15821 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15822 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15823 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15824 EntryMask += ZeroMask;
15825 }
15826}
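// Illustrative example: for ChainLength == 2, a mask of 0x04050c0c is shifted
// down to 0x00000405 and then becomes 0x0c0c0405, so the two used selectors
// move to the low bytes and the unused high bytes select constant zero.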
15827
15828static bool isMul(const SDValue Op) {
15829 auto Opcode = Op.getOpcode();
15830
15831 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15832 Opcode == AMDGPUISD::MUL_I24);
15833}
15834
15835static std::optional<bool>
15837 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15838 const SDValue &S1Op, const SelectionDAG &DAG) {
15839 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15840 // of the dot4 are irrelevant.
15841 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15842 return false;
15843
15844 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15845 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15846 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15847 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15848 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15849 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15850
15851 assert(!(S0IsUnsigned && S0IsSigned));
15852 assert(!(S1IsUnsigned && S1IsSigned));
15853
15854 // There are 9 possible permutations of
15855 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15856
15857 // In two permutations, the sign bits are known to be the same for both Ops,
15858 // so simply return Signed / Unsigned corresponding to the MSB
15859
15860 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15861 return S0IsSigned;
15862
15863 // In another two permutations, the sign bits are known to be opposite. In
15864 // this case return std::nullopt to indicate a bad match.
15865
15866 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15867 return std::nullopt;
15868
15869 // In the remaining five permutations, we don't know the value of the sign
15870 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15871 // the upper bits must be extension bits. Thus, the only ways for the sign
15872 // bit to be unknown are if it was sign extended from an unknown value, or if
15873 // it was any extended. In either case, it is correct to use the signed
15874 // version of the signedness semantics of dot4.
15875
15876 // In two of these permutations, we know the sign bit is set for
15877 // one op, and the other is unknown. It is okay to use the signed
15878 // version of dot4.
15879 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15880 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15881 return true;
15882
15883 // In one such permutation, we don't know either of the sign bits. It is okay
15884 // to use the signed version of dot4.
15885 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15886 return true;
15887
15888 // In two of these permutations, we know the sign bit is unset for
15889 // one op, and the other is unknown. Return std::nullopt to indicate a
15890 // bad match.
15891 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15892 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15893 return std::nullopt;
15894
15895 llvm_unreachable("Fully covered condition");
15896}
15897
15898SDValue SITargetLowering::performAddCombine(SDNode *N,
15899 DAGCombinerInfo &DCI) const {
15900 SelectionDAG &DAG = DCI.DAG;
15901 EVT VT = N->getValueType(0);
15902 SDLoc SL(N);
15903 SDValue LHS = N->getOperand(0);
15904 SDValue RHS = N->getOperand(1);
15905
15906 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15907 if (Subtarget->hasMad64_32()) {
15908 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15909 return Folded;
15910 }
15911 }
15912
15913 if (SDValue V = reassociateScalarOps(N, DAG)) {
15914 return V;
15915 }
15916
15917 if (VT == MVT::i64) {
15918 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15919 return Folded;
15920 }
15921
15922 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15923 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15924 SDValue TempNode(N, 0);
15925 std::optional<bool> IsSigned;
15926 SmallVector<DotSrc, 4> Src0s;
15927 SmallVector<DotSrc, 4> Src1s;
15928 SmallVector<SDValue, 4> Src2s;
15929
15930 // Match the v_dot4 tree, while collecting src nodes.
15931 int ChainLength = 0;
15932 for (int I = 0; I < 4; I++) {
15933 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15934 if (MulIdx == -1)
15935 break;
15936 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15937 if (!Src0)
15938 break;
15939 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15940 if (!Src1)
15941 break;
15942
15943 auto IterIsSigned = checkDot4MulSignedness(
15944 TempNode->getOperand(MulIdx), *Src0, *Src1,
15945 TempNode->getOperand(MulIdx)->getOperand(0),
15946 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15947 if (!IterIsSigned)
15948 break;
15949 if (!IsSigned)
15950 IsSigned = *IterIsSigned;
15951 if (*IterIsSigned != *IsSigned)
15952 break;
15953 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15954 auto AddIdx = 1 - MulIdx;
15955 // Allow the special case where add (add (mul24, 0), mul24) was folded to
15956 // add (mul24, mul24).
15957 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15958 Src2s.push_back(TempNode->getOperand(AddIdx));
15959 auto Src0 =
15960 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15961 if (!Src0)
15962 break;
15963 auto Src1 =
15964 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15965 if (!Src1)
15966 break;
15967 auto IterIsSigned = checkDot4MulSignedness(
15968 TempNode->getOperand(AddIdx), *Src0, *Src1,
15969 TempNode->getOperand(AddIdx)->getOperand(0),
15970 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15971 if (!IterIsSigned)
15972 break;
15973 assert(IsSigned);
15974 if (*IterIsSigned != *IsSigned)
15975 break;
15976 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15977 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15978 ChainLength = I + 2;
15979 break;
15980 }
15981
15982 TempNode = TempNode->getOperand(AddIdx);
15983 Src2s.push_back(TempNode);
15984 ChainLength = I + 1;
15985 if (TempNode->getNumOperands() < 2)
15986 break;
15987 LHS = TempNode->getOperand(0);
15988 RHS = TempNode->getOperand(1);
15989 }
15990
15991 if (ChainLength < 2)
15992 return SDValue();
15993
15994 // Masks were constructed with the assumption that we would find a chain of
15995 // length 4. If not, then we need to zero out the unused MSB bytes (via a perm
15996 // selector of 0x0c) so they do not affect the dot calculation.
15997 if (ChainLength < 4) {
15998 fixMasks(Src0s, ChainLength);
15999 fixMasks(Src1s, ChainLength);
16000 }
16001
16002 SDValue Src0, Src1;
16003
16004 // If we are just using a single source for both, and have permuted the
16005 // bytes consistently, we can just use the sources without permuting
16006 // (commutation).
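 // This is safe because applying the same byte permutation to both dot4
 // operands only reorders the four byte products, and their sum is unchanged.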
16007 bool UseOriginalSrc = false;
16008 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16009 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16010 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16011 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16012 SmallVector<unsigned, 4> SrcBytes;
16013 auto Src0Mask = Src0s.begin()->PermMask;
16014 SrcBytes.push_back(Src0Mask & 0xFF000000);
16015 bool UniqueEntries = true;
16016 for (auto I = 1; I < 4; I++) {
16017 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16018
16019 if (is_contained(SrcBytes, NextByte)) {
16020 UniqueEntries = false;
16021 break;
16022 }
16023 SrcBytes.push_back(NextByte);
16024 }
16025
16026 if (UniqueEntries) {
16027 UseOriginalSrc = true;
16028
16029 auto *FirstElt = Src0s.begin();
16030 auto FirstEltOp =
16031 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16032
16033 auto *SecondElt = Src1s.begin();
16034 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16035 SecondElt->DWordOffset);
16036
16037 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16038 MVT::getIntegerVT(32));
16039 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16040 MVT::getIntegerVT(32));
16041 }
16042 }
16043
16044 if (!UseOriginalSrc) {
16045 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16046 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16047 }
16048
16049 assert(IsSigned);
16050 SDValue Src2 =
16051 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16052
16053 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16054 : Intrinsic::amdgcn_udot4,
16055 SL, MVT::i64);
16056
16057 assert(!VT.isVector());
16058 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16059 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16060
16061 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16062 }
16063
16064 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16065 return SDValue();
16066
16067 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16068 // add x, sext (setcc) => usubo_carry x, 0, setcc
16069 unsigned Opc = LHS.getOpcode();
16070 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16071 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16072 std::swap(RHS, LHS);
16073
16074 Opc = RHS.getOpcode();
16075 switch (Opc) {
16076 default:
16077 break;
16078 case ISD::ZERO_EXTEND:
16079 case ISD::SIGN_EXTEND:
16080 case ISD::ANY_EXTEND: {
16081 auto Cond = RHS.getOperand(0);
16082 // If this won't be a real VOPC output, we would still need to insert an
16083 // extra instruction anyway.
16084 if (!isBoolSGPR(Cond))
16085 break;
16086 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16087 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16088 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16089 return DAG.getNode(Opc, SL, VTList, Args);
16090 }
16091 case ISD::UADDO_CARRY: {
16092 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16093 if (!isNullConstant(RHS.getOperand(1)))
16094 break;
16095 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16096 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16097 }
16098 }
16099 return SDValue();
16100}
16101
16102SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16103 DAGCombinerInfo &DCI) const {
16104 SelectionDAG &DAG = DCI.DAG;
16105 SDLoc DL(N);
16106 EVT VT = N->getValueType(0);
16107 SDValue N0 = N->getOperand(0);
16108 SDValue N1 = N->getOperand(1);
16109
16110 // The following folds transform PTRADDs into regular arithmetic in cases
16111 // where the PTRADD wouldn't be folded as an immediate offset into memory
16112 // instructions anyway. They are target-specific in that other targets might
16113 // prefer to not lose information about the pointer arithmetic.
16114
16115 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16116 // Adapted from DAGCombiner::visitADDLikeCommutative.
16117 SDValue V, K;
16118 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16119 SDNodeFlags ShlFlags = N1->getFlags();
16120 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16121 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16122 // preserved.
16123 SDNodeFlags NewShlFlags =
16124 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16125 ? SDNodeFlags(SDNodeFlags::NoSignedWrap)
16126 : SDNodeFlags();
16127 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16128 DCI.AddToWorklist(Inner.getNode());
16129 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16130 }
16131
16132 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16133 // performAddCombine.
16134 if (N1.getOpcode() == ISD::MUL) {
16135 if (Subtarget->hasMad64_32()) {
16136 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16137 return Folded;
16138 }
16139 }
16140
16141 // If the 32 low bits of the constant are all zero, there is nothing to fold
16142 // into an immediate offset, so it's better to eliminate the unnecessary
16143 // addition for the lower 32 bits than to preserve the PTRADD.
16144 // Analogous to a fold in performAddCombine.
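// For example, adding a constant such as (5 << 32) leaves the low 32 bits of
// the pointer unchanged, so only the high half needs a real add (illustrative
// value).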
16145 if (VT == MVT::i64) {
16146 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16147 return Folded;
16148 }
16149
16150 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16151 return SDValue();
16152
16153 SDValue X = N0;
16154 SDValue Y = N1.getOperand(0);
16155 SDValue Z = N1.getOperand(1);
16156 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16157 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16158
16159 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16160 Y->isDivergent() != Z->isDivergent()) {
16161 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16162 // y are uniform and z isn't.
16163 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16164 // z are uniform and y isn't.
16165 // The goal is to push uniform operands up in the computation, so that they
16166 // can be handled with scalar operations. We can't use reassociateScalarOps
16167 // for this since it requires two identical commutative operations to
16168 // reassociate.
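// For instance, if x and y are in SGPRs and z is a per-lane VGPR value,
// (ptradd (ptradd x, y), z) allows x + y to be folded into a scalar add
// before the divergent part is applied.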
16169 if (Y->isDivergent())
16170 std::swap(Y, Z);
16171 // If both additions in the original were NUW, reassociation preserves that.
16172 SDNodeFlags ReassocFlags =
16173 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16174 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16175 DCI.AddToWorklist(UniformInner.getNode());
16176 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16177 }
16178
16179 return SDValue();
16180}
16181
16182SDValue SITargetLowering::performSubCombine(SDNode *N,
16183 DAGCombinerInfo &DCI) const {
16184 SelectionDAG &DAG = DCI.DAG;
16185 EVT VT = N->getValueType(0);
16186
16187 if (VT == MVT::i64) {
16188 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16189 return Folded;
16190 }
16191
16192 if (VT != MVT::i32)
16193 return SDValue();
16194
16195 SDLoc SL(N);
16196 SDValue LHS = N->getOperand(0);
16197 SDValue RHS = N->getOperand(1);
16198
16199 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16200 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16201 unsigned Opc = RHS.getOpcode();
16202 switch (Opc) {
16203 default:
16204 break;
16205 case ISD::ZERO_EXTEND:
16206 case ISD::SIGN_EXTEND:
16207 case ISD::ANY_EXTEND: {
16208 auto Cond = RHS.getOperand(0);
16209 // If this won't be a real VOPC output, we would still need to insert an
16210 // extra instruction anyway.
16211 if (!isBoolSGPR(Cond))
16212 break;
16213 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16214 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16215 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16216 return DAG.getNode(Opc, SL, VTList, Args);
16217 }
16218 }
16219
16220 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16221 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16222 if (!isNullConstant(LHS.getOperand(1)))
16223 return SDValue();
16224 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16225 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16226 }
16227 return SDValue();
16228}
16229
16230SDValue
16231SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16232 DAGCombinerInfo &DCI) const {
16233
16234 if (N->getValueType(0) != MVT::i32)
16235 return SDValue();
16236
16237 if (!isNullConstant(N->getOperand(1)))
16238 return SDValue();
16239
16240 SelectionDAG &DAG = DCI.DAG;
16241 SDValue LHS = N->getOperand(0);
16242
16243 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16244 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16245 unsigned LHSOpc = LHS.getOpcode();
16246 unsigned Opc = N->getOpcode();
16247 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16248 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16249 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16250 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16251 }
16252 return SDValue();
16253}
16254
16255SDValue SITargetLowering::performFAddCombine(SDNode *N,
16256 DAGCombinerInfo &DCI) const {
16257 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16258 return SDValue();
16259
16260 SelectionDAG &DAG = DCI.DAG;
16261 EVT VT = N->getValueType(0);
16262
16263 SDLoc SL(N);
16264 SDValue LHS = N->getOperand(0);
16265 SDValue RHS = N->getOperand(1);
16266
16267 // These should really be instruction patterns, but writing patterns with
16268 // source modifiers is a pain.
16269
16270 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16271 if (LHS.getOpcode() == ISD::FADD) {
16272 SDValue A = LHS.getOperand(0);
16273 if (A == LHS.getOperand(1)) {
16274 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16275 if (FusedOp != 0) {
16276 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16277 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16278 }
16279 }
16280 }
16281
16282 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16283 if (RHS.getOpcode() == ISD::FADD) {
16284 SDValue A = RHS.getOperand(0);
16285 if (A == RHS.getOperand(1)) {
16286 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16287 if (FusedOp != 0) {
16288 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16289 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16290 }
16291 }
16292 }
16293
16294 return SDValue();
16295}
16296
16297SDValue SITargetLowering::performFSubCombine(SDNode *N,
16298 DAGCombinerInfo &DCI) const {
16299 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16300 return SDValue();
16301
16302 SelectionDAG &DAG = DCI.DAG;
16303 SDLoc SL(N);
16304 EVT VT = N->getValueType(0);
16305 assert(!VT.isVector());
16306
16307 // Try to get the fneg to fold into the source modifier. This undoes generic
16308 // DAG combines and folds them into the mad.
16309 //
16310 // Only do this if we are not trying to support denormals. v_mad_f32 does
16311 // not support denormals ever.
16312 SDValue LHS = N->getOperand(0);
16313 SDValue RHS = N->getOperand(1);
16314 if (LHS.getOpcode() == ISD::FADD) {
16315 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16316 SDValue A = LHS.getOperand(0);
16317 if (A == LHS.getOperand(1)) {
16318 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16319 if (FusedOp != 0) {
16320 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16321 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16322
16323 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16324 }
16325 }
16326 }
16327
16328 if (RHS.getOpcode() == ISD::FADD) {
16329 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16330
16331 SDValue A = RHS.getOperand(0);
16332 if (A == RHS.getOperand(1)) {
16333 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16334 if (FusedOp != 0) {
16335 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16336 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16337 }
16338 }
16339 }
16340
16341 return SDValue();
16342}
16343
16344SDValue SITargetLowering::performFDivCombine(SDNode *N,
16345 DAGCombinerInfo &DCI) const {
16346 SelectionDAG &DAG = DCI.DAG;
16347 SDLoc SL(N);
16348 EVT VT = N->getValueType(0);
16349 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16350 return SDValue();
16351
16352 SDValue LHS = N->getOperand(0);
16353 SDValue RHS = N->getOperand(1);
16354
16355 SDNodeFlags Flags = N->getFlags();
16356 SDNodeFlags RHSFlags = RHS->getFlags();
16357 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16358 !RHS->hasOneUse())
16359 return SDValue();
16360
16361 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16362 bool IsNegative = false;
16363 if (CLHS->isExactlyValue(1.0) ||
16364 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16365 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16366 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16367 if (RHS.getOpcode() == ISD::FSQRT) {
16368 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16369 SDValue Rsq =
16370 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16371 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16372 }
16373 }
16374 }
16375
16376 return SDValue();
16377}
16378
16379SDValue SITargetLowering::performFMulCombine(SDNode *N,
16380 DAGCombinerInfo &DCI) const {
16381 SelectionDAG &DAG = DCI.DAG;
16382 EVT VT = N->getValueType(0);
16383 EVT ScalarVT = VT.getScalarType();
16384 EVT IntVT = VT.changeElementType(MVT::i32);
16385
16386 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16387 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16388 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16389 return SDValue();
16390 }
16391
16392 SDValue LHS = N->getOperand(0);
16393 SDValue RHS = N->getOperand(1);
16394
16395 // It is cheaper to realize i32 inline constants than to materialize
16396 // f16 or f64 (or even non-inline f32) values; this is possible via
16397 // ldexp usage, as shown below:
16398 //
16399 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16400 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16401 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
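// For example (illustrative constants):
// fmul x, (select y, 0.5, 8.0) -> ldexp(x, (select i32 y, -1, 3))
// fmul x, (select y, -4.0, -2.0) -> ldexp((fneg x), (select i32 y, 2, 1))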
16402 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16403 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16404 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16405 if (!TrueNode)
16406 return SDValue();
16407 const ConstantFPSDNode *FalseNode =
16408 isConstOrConstSplatFP(RHS.getOperand(2));
16409 if (!FalseNode)
16410 return SDValue();
16411
16412 if (TrueNode->isNegative() != FalseNode->isNegative())
16413 return SDValue();
16414
16415 // For f32, only non-inline constants should be transformed.
16416 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16417 if (ScalarVT == MVT::f32 &&
16418 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16419 TII->isInlineConstant(FalseNode->getValueAPF()))
16420 return SDValue();
16421
16422 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16423 if (TrueNodeExpVal == INT_MIN)
16424 return SDValue();
16425 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16426 if (FalseNodeExpVal == INT_MIN)
16427 return SDValue();
16428
16429 SDLoc SL(N);
16430 SDValue SelectNode =
16431 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16432 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16433 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16434
16435 LHS = TrueNode->isNegative()
16436 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16437 : LHS;
16438
16439 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16440 }
16441
16442 return SDValue();
16443}
16444
16445SDValue SITargetLowering::performFMACombine(SDNode *N,
16446 DAGCombinerInfo &DCI) const {
16447 SelectionDAG &DAG = DCI.DAG;
16448 EVT VT = N->getValueType(0);
16449 SDLoc SL(N);
16450
16451 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16452 return SDValue();
16453
16454 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16455 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
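// The matched chain computes (f32)S0.x*(f32)S1.x + (f32)S0.y*(f32)S1.y + z,
// which is exactly what fdot2 produces with a single f32 accumulator.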
16456 SDValue Op1 = N->getOperand(0);
16457 SDValue Op2 = N->getOperand(1);
16458 SDValue FMA = N->getOperand(2);
16459
16460 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16461 Op2.getOpcode() != ISD::FP_EXTEND)
16462 return SDValue();
16463
16464 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16465 // regardless of the denorm mode setting. Therefore,
16466 // fp-contract is sufficient to allow generating fdot2.
16467 const TargetOptions &Options = DAG.getTarget().Options;
16468 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16469 (N->getFlags().hasAllowContract() &&
16470 FMA->getFlags().hasAllowContract())) {
16471 Op1 = Op1.getOperand(0);
16472 Op2 = Op2.getOperand(0);
16473 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16474 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16475 return SDValue();
16476
16477 SDValue Vec1 = Op1.getOperand(0);
16478 SDValue Idx1 = Op1.getOperand(1);
16479 SDValue Vec2 = Op2.getOperand(0);
16480
16481 SDValue FMAOp1 = FMA.getOperand(0);
16482 SDValue FMAOp2 = FMA.getOperand(1);
16483 SDValue FMAAcc = FMA.getOperand(2);
16484
16485 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16486 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16487 return SDValue();
16488
16489 FMAOp1 = FMAOp1.getOperand(0);
16490 FMAOp2 = FMAOp2.getOperand(0);
16491 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16492 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16493 return SDValue();
16494
16495 SDValue Vec3 = FMAOp1.getOperand(0);
16496 SDValue Vec4 = FMAOp2.getOperand(0);
16497 SDValue Idx2 = FMAOp1.getOperand(1);
16498
16499 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16500 // Idx1 and Idx2 cannot be the same.
16501 Idx1 == Idx2)
16502 return SDValue();
16503
16504 if (Vec1 == Vec2 || Vec3 == Vec4)
16505 return SDValue();
16506
16507 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16508 return SDValue();
16509
16510 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16511 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16512 DAG.getTargetConstant(0, SL, MVT::i1));
16513 }
16514 }
16515 return SDValue();
16516}
16517
16518SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16519 DAGCombinerInfo &DCI) const {
16520 SelectionDAG &DAG = DCI.DAG;
16521 SDLoc SL(N);
16522
16523 SDValue LHS = N->getOperand(0);
16524 SDValue RHS = N->getOperand(1);
16525 EVT VT = LHS.getValueType();
16526 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16527
16528 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16529 if (!CRHS) {
16530 CRHS = dyn_cast<ConstantSDNode>(LHS);
16531 if (CRHS) {
16532 std::swap(LHS, RHS);
16533 CC = getSetCCSwappedOperands(CC);
16534 }
16535 }
16536
16537 if (CRHS) {
16538 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16539 isBoolSGPR(LHS.getOperand(0))) {
16540 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16541 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16542 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16543 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
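// This works because (sext from i1 cc) is 0 when cc is false and -1 when cc
// is true, so each comparison above reduces to cc itself or its negation.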
16544 if ((CRHS->isAllOnes() &&
16545 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16546 (CRHS->isZero() &&
16547 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16548 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16549 DAG.getAllOnesConstant(SL, MVT::i1));
16550 if ((CRHS->isAllOnes() &&
16551 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16552 (CRHS->isZero() &&
16553 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16554 return LHS.getOperand(0);
16555 }
16556
16557 const APInt &CRHSVal = CRHS->getAPIntValue();
16558 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16559 LHS.getOpcode() == ISD::SELECT &&
16560 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16561 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16562 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16563 isBoolSGPR(LHS.getOperand(0))) {
16564 // Given CT != FT:
16565 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16566 // setcc (select cc, CT, CF), CF, ne => cc
16567 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16568 // setcc (select cc, CT, CF), CT, eq => cc
16569 const APInt &CT = LHS.getConstantOperandAPInt(1);
16570 const APInt &CF = LHS.getConstantOperandAPInt(2);
16571
16572 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16573 (CT == CRHSVal && CC == ISD::SETNE))
16574 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16575 DAG.getAllOnesConstant(SL, MVT::i1));
16576 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16577 (CT == CRHSVal && CC == ISD::SETEQ))
16578 return LHS.getOperand(0);
16579 }
16580 }
16581
16582 if (VT != MVT::f32 && VT != MVT::f64 &&
16583 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16584 return SDValue();
16585
16586 // Match isinf/isfinite pattern
16587 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16588 // (fcmp one (fabs x), inf) -> (fp_class x,
16589 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
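// Since fabs(x) drops the sign, a single ordered compare against +inf
// distinguishes +/-inf (oeq) from every finite value class (one).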
16590 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16591 LHS.getOpcode() == ISD::FABS) {
16592 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16593 if (!CRHS)
16594 return SDValue();
16595
16596 const APFloat &APF = CRHS->getValueAPF();
16597 if (APF.isInfinity() && !APF.isNegative()) {
16598 const unsigned IsInfMask =
16599 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16600 const unsigned IsFiniteMask =
16601 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16602 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16603 SIInstrFlags::P_SUBNORMAL;
16604 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16605 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16606 DAG.getConstant(Mask, SL, MVT::i32));
16607 }
16608 }
16609
16610 return SDValue();
16611}
16612
16613SDValue
16614SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16615 DAGCombinerInfo &DCI) const {
16616 SelectionDAG &DAG = DCI.DAG;
16617 SDLoc SL(N);
16618 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16619
16620 SDValue Src = N->getOperand(0);
16621 SDValue Shift = N->getOperand(0);
16622
16623 // TODO: Extend type shouldn't matter (assuming legal types).
16624 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16625 Shift = Shift.getOperand(0);
16626
16627 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16628 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16629 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16630 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16631 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16632 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
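// In general, a srl by 8*M bits moves the selection to byte (N + M) and a shl
// moves it to byte (N - M); the ShiftOffset computation below implements this
// and bails out if the result is not a whole in-range byte.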
16633 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16634 SDValue Shifted = DAG.getZExtOrTrunc(
16635 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16636
16637 unsigned ShiftOffset = 8 * Offset;
16638 if (Shift.getOpcode() == ISD::SHL)
16639 ShiftOffset -= C->getZExtValue();
16640 else
16641 ShiftOffset += C->getZExtValue();
16642
16643 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16644 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16645 MVT::f32, Shifted);
16646 }
16647 }
16648 }
16649
16650 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16651 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16652 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16653 // We simplified Src. If this node is not dead, visit it again so it is
16654 // folded properly.
16655 if (N->getOpcode() != ISD::DELETED_NODE)
16656 DCI.AddToWorklist(N);
16657 return SDValue(N, 0);
16658 }
16659
16660 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16661 if (SDValue DemandedSrc =
16662 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16663 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16664
16665 return SDValue();
16666}
16667
16668SDValue SITargetLowering::performClampCombine(SDNode *N,
16669 DAGCombinerInfo &DCI) const {
16670 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16671 if (!CSrc)
16672 return SDValue();
16673
16674 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16675 const APFloat &F = CSrc->getValueAPF();
16676 APFloat Zero = APFloat::getZero(F.getSemantics());
16677 if (F < Zero ||
16678 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16679 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16680 }
16681
16682 APFloat One(F.getSemantics(), "1.0");
16683 if (F > One)
16684 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16685
16686 return SDValue(CSrc, 0);
16687}
16688
16689SDValue SITargetLowering::performSelectCombine(SDNode *N,
16690 DAGCombinerInfo &DCI) const {
16691
16692 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16693 // integer).
16694 // Detect when CMP and SELECT use the same constant and fold them to avoid
16695 // loading the constant twice. Specifically handles patterns like:
16696 // %cmp = icmp eq i32 %val, 4242
16697 // %sel = select i1 %cmp, i32 4242, i32 %other
16698 // It can be optimized to reuse %val instead of 4242 in select.
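// After the fold the select becomes:
// %sel = select i1 %cmp, i32 %val, i32 %other
// which avoids materializing the constant 4242 a second time.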
16699 SDValue Cond = N->getOperand(0);
16700 SDValue TrueVal = N->getOperand(1);
16701 SDValue FalseVal = N->getOperand(2);
16702
16703 // Check if condition is a comparison.
16704 if (Cond.getOpcode() != ISD::SETCC)
16705 return SDValue();
16706
16707 SDValue LHS = Cond.getOperand(0);
16708 SDValue RHS = Cond.getOperand(1);
16709 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16710
16711 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16712 bool isInteger = LHS.getValueType().isInteger();
16713
16714 // Handle simple floating-point and integer types only.
16715 if (!isFloatingPoint && !isInteger)
16716 return SDValue();
16717
16718 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16719 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16720 if (!isEquality && !isNonEquality)
16721 return SDValue();
16722
16723 SDValue ArgVal, ConstVal;
16724 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16725 (isInteger && isa<ConstantSDNode>(RHS))) {
16726 ConstVal = RHS;
16727 ArgVal = LHS;
16728 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16729 (isInteger && isa<ConstantSDNode>(LHS))) {
16730 ConstVal = LHS;
16731 ArgVal = RHS;
16732 } else {
16733 return SDValue();
16734 }
16735
16736 // Skip optimization for inlinable immediates.
16737 if (isFloatingPoint) {
16738 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16739 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16740 return SDValue();
16741 } else {
16742 if (AMDGPU::isInlinableIntLiteral(
16743 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16744 return SDValue();
16745 }
16746
16747 // For equality and non-equality comparisons, patterns:
16748 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16749 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16750 if (!(isEquality && TrueVal == ConstVal) &&
16751 !(isNonEquality && FalseVal == ConstVal))
16752 return SDValue();
16753
16754 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16755 SDValue SelectRHS =
16756 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16757 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16758 SelectLHS, SelectRHS);
16759}
16760
16761 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16762 DAGCombinerInfo &DCI) const {
16763 switch (N->getOpcode()) {
16764 case ISD::ADD:
16765 case ISD::SUB:
16766 case ISD::SHL:
16767 case ISD::SRL:
16768 case ISD::SRA:
16769 case ISD::AND:
16770 case ISD::OR:
16771 case ISD::XOR:
16772 case ISD::MUL:
16773 case ISD::SETCC:
16774 case ISD::SELECT:
16775 case ISD::SMIN:
16776 case ISD::SMAX:
16777 case ISD::UMIN:
16778 case ISD::UMAX:
16779 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16780 return Res;
16781 break;
16782 default:
16783 break;
16784 }
16785
16786 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16787 return SDValue();
16788
16789 switch (N->getOpcode()) {
16790 case ISD::ADD:
16791 return performAddCombine(N, DCI);
16792 case ISD::PTRADD:
16793 return performPtrAddCombine(N, DCI);
16794 case ISD::SUB:
16795 return performSubCombine(N, DCI);
16796 case ISD::UADDO_CARRY:
16797 case ISD::USUBO_CARRY:
16798 return performAddCarrySubCarryCombine(N, DCI);
16799 case ISD::FADD:
16800 return performFAddCombine(N, DCI);
16801 case ISD::FSUB:
16802 return performFSubCombine(N, DCI);
16803 case ISD::FDIV:
16804 return performFDivCombine(N, DCI);
16805 case ISD::FMUL:
16806 return performFMulCombine(N, DCI);
16807 case ISD::SETCC:
16808 return performSetCCCombine(N, DCI);
16809 case ISD::SELECT:
16810 if (auto Res = performSelectCombine(N, DCI))
16811 return Res;
16812 break;
16813 case ISD::FMAXNUM:
16814 case ISD::FMINNUM:
16815 case ISD::FMAXNUM_IEEE:
16816 case ISD::FMINNUM_IEEE:
16817 case ISD::FMAXIMUM:
16818 case ISD::FMINIMUM:
16819 case ISD::FMAXIMUMNUM:
16820 case ISD::FMINIMUMNUM:
16821 case ISD::SMAX:
16822 case ISD::SMIN:
16823 case ISD::UMAX:
16824 case ISD::UMIN:
16825 case AMDGPUISD::FMIN_LEGACY:
16826 case AMDGPUISD::FMAX_LEGACY:
16827 return performMinMaxCombine(N, DCI);
16828 case ISD::FMA:
16829 return performFMACombine(N, DCI);
16830 case ISD::AND:
16831 return performAndCombine(N, DCI);
16832 case ISD::OR:
16833 return performOrCombine(N, DCI);
16834 case ISD::FSHR: {
16835 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16836 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16837 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16838 return matchPERM(N, DCI);
16839 }
16840 break;
16841 }
16842 case ISD::XOR:
16843 return performXorCombine(N, DCI);
16844 case ISD::ZERO_EXTEND:
16845 return performZeroExtendCombine(N, DCI);
16846 case ISD::SIGN_EXTEND_INREG:
16847 return performSignExtendInRegCombine(N, DCI);
16848 case AMDGPUISD::FP_CLASS:
16849 return performClassCombine(N, DCI);
16850 case ISD::FCANONICALIZE:
16851 return performFCanonicalizeCombine(N, DCI);
16852 case AMDGPUISD::RCP:
16853 return performRcpCombine(N, DCI);
16854 case ISD::FLDEXP:
16855 case AMDGPUISD::FRACT:
16856 case AMDGPUISD::RSQ:
16857 case AMDGPUISD::RCP_LEGACY:
16858 case AMDGPUISD::RCP_IFLAG:
16859 case AMDGPUISD::RSQ_CLAMP: {
16860 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16861 SDValue Src = N->getOperand(0);
16862 if (Src.isUndef())
16863 return Src;
16864 break;
16865 }
16866 case ISD::SINT_TO_FP:
16867 case ISD::UINT_TO_FP:
16868 return performUCharToFloatCombine(N, DCI);
16869 case ISD::FCOPYSIGN:
16870 return performFCopySignCombine(N, DCI);
16871 case AMDGPUISD::CVT_F32_UBYTE0:
16872 case AMDGPUISD::CVT_F32_UBYTE1:
16873 case AMDGPUISD::CVT_F32_UBYTE2:
16874 case AMDGPUISD::CVT_F32_UBYTE3:
16875 return performCvtF32UByteNCombine(N, DCI);
16876 case AMDGPUISD::FMED3:
16877 return performFMed3Combine(N, DCI);
16878 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16879 return performCvtPkRTZCombine(N, DCI);
16880 case AMDGPUISD::CLAMP:
16881 return performClampCombine(N, DCI);
16882 case ISD::SCALAR_TO_VECTOR: {
16883 SelectionDAG &DAG = DCI.DAG;
16884 EVT VT = N->getValueType(0);
16885
16886 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16887 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16888 SDLoc SL(N);
16889 SDValue Src = N->getOperand(0);
16890 EVT EltVT = Src.getValueType();
16891 if (EltVT != MVT::i16)
16892 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16893
16894 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16895 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16896 }
16897
16898 break;
16899 }
16900 case ISD::EXTRACT_VECTOR_ELT:
16901 return performExtractVectorEltCombine(N, DCI);
16902 case ISD::INSERT_VECTOR_ELT:
16903 return performInsertVectorEltCombine(N, DCI);
16904 case ISD::FP_ROUND:
16905 return performFPRoundCombine(N, DCI);
16906 case ISD::LOAD: {
16907 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16908 return Widened;
16909 [[fallthrough]];
16910 }
16911 default: {
16912 if (!DCI.isBeforeLegalize()) {
16913 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16914 return performMemSDNodeCombine(MemNode, DCI);
16915 }
16916
16917 break;
16918 }
16919 }
16920
16921 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
16922}
16923
16924/// Helper function for adjustWritemask
16925static unsigned SubIdx2Lane(unsigned Idx) {
16926 switch (Idx) {
16927 default:
16928 return ~0u;
16929 case AMDGPU::sub0:
16930 return 0;
16931 case AMDGPU::sub1:
16932 return 1;
16933 case AMDGPU::sub2:
16934 return 2;
16935 case AMDGPU::sub3:
16936 return 3;
16937 case AMDGPU::sub4:
16938 return 4; // Possible with TFE/LWE
16939 }
16940}
16941
16942/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
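/// If only some components of the result are actually used, the dmask can be
/// shrunk so the instruction writes (and occupies) fewer registers.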
16943SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16944 SelectionDAG &DAG) const {
16945 unsigned Opcode = Node->getMachineOpcode();
16946
16947 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16948 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16949 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16950 return Node; // not implemented for D16
16951
16952 SDNode *Users[5] = {nullptr};
16953 unsigned Lane = 0;
16954 unsigned DmaskIdx =
16955 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16956 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16957 unsigned NewDmask = 0;
16958 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16959 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16960 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16961 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16962 unsigned TFCLane = 0;
16963 bool HasChain = Node->getNumValues() > 1;
16964
16965 if (OldDmask == 0) {
16966 // These are folded out, but on the off chance it happens, don't assert.
16967 return Node;
16968 }
16969
16970 unsigned OldBitsSet = llvm::popcount(OldDmask);
16971 // Work out which is the TFE/LWE lane if that is enabled.
16972 if (UsesTFC) {
16973 TFCLane = OldBitsSet;
16974 }
16975
16976 // Try to figure out the used register components
16977 for (SDUse &Use : Node->uses()) {
16978
16979 // Don't look at users of the chain.
16980 if (Use.getResNo() != 0)
16981 continue;
16982
16983 SDNode *User = Use.getUser();
16984
16985 // Abort if we can't understand the usage
16986 if (!User->isMachineOpcode() ||
16987 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16988 return Node;
16989
16990 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16991 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16992 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16993 // set, etc.
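// For example, if OldDmask is 0b1010, Lane 0 corresponds to the Y component
// and Lane 1 to the W component.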
16994 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16995 if (Lane == ~0u)
16996 return Node;
16997
16998 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16999 if (UsesTFC && Lane == TFCLane) {
17000 Users[Lane] = User;
17001 } else {
17002 // Set which texture component corresponds to the lane.
17003 unsigned Comp;
17004 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17005 Comp = llvm::countr_zero(Dmask);
17006 Dmask &= ~(1 << Comp);
17007 }
17008
17009 // Abort if we have more than one user per component.
17010 if (Users[Lane])
17011 return Node;
17012
17013 Users[Lane] = User;
17014 NewDmask |= 1 << Comp;
17015 }
17016 }
17017
17018 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17019 bool NoChannels = !NewDmask;
17020 if (NoChannels) {
17021 if (!UsesTFC) {
17022 // No uses of the result and not using TFC. Then do nothing.
17023 return Node;
17024 }
17025 // If the original dmask has only one channel, there is nothing to do.
17026 if (OldBitsSet == 1)
17027 return Node;
17028 // Use an arbitrary dmask - required for the instruction to work
17029 NewDmask = 1;
17030 }
17031 // Abort if there's no change
17032 if (NewDmask == OldDmask)
17033 return Node;
17034
17035 unsigned BitsSet = llvm::popcount(NewDmask);
17036
17037 // Check for TFE or LWE - increase the number of channels by one to account
17038 // for the extra return value
17039 // This will need adjustment for D16 if this is also included in
17040 // adjustWriteMask (this function) but at present D16 are excluded.
17041 unsigned NewChannels = BitsSet + UsesTFC;
17042
17043 int NewOpcode =
17044 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17045 assert(NewOpcode != -1 &&
17046 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17047 "failed to find equivalent MIMG op");
17048
17049 // Adjust the writemask in the node
17050 SmallVector<SDValue, 12> Ops;
17051 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17052 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17053 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17054
17055 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17056
17057 MVT ResultVT = NewChannels == 1
17058 ? SVT
17059 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17060 : NewChannels == 5 ? 8
17061 : NewChannels);
17062 SDVTList NewVTList =
17063 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17064
17065 MachineSDNode *NewNode =
17066 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17067
17068 if (HasChain) {
17069 // Update chain.
17070 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17071 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17072 }
17073
17074 if (NewChannels == 1) {
17075 assert(Node->hasNUsesOfValue(1, 0));
17076 SDNode *Copy =
17077 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17078 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17079 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17080 return nullptr;
17081 }
17082
17083 // Update the users of the node with the new indices
17084 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17085 SDNode *User = Users[i];
17086 if (!User) {
17087 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17088 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17089 if (i || !NoChannels)
17090 continue;
17091 } else {
17092 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17093 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17094 if (NewUser != User) {
17095 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17096 DAG.RemoveDeadNode(User);
17097 }
17098 }
17099
17100 switch (Idx) {
17101 default:
17102 break;
17103 case AMDGPU::sub0:
17104 Idx = AMDGPU::sub1;
17105 break;
17106 case AMDGPU::sub1:
17107 Idx = AMDGPU::sub2;
17108 break;
17109 case AMDGPU::sub2:
17110 Idx = AMDGPU::sub3;
17111 break;
17112 case AMDGPU::sub3:
17113 Idx = AMDGPU::sub4;
17114 break;
17115 }
17116 }
17117
17118 DAG.RemoveDeadNode(Node);
17119 return nullptr;
17120}
17121
17122 static bool isFrameIndexOp(SDValue Op) {
17123 if (Op.getOpcode() == ISD::AssertZext)
17124 Op = Op.getOperand(0);
17125
17126 return isa<FrameIndexSDNode>(Op);
17127}
17128
17129/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17130/// with frame index operands.
17131 /// LLVM assumes that inputs to these instructions are registers.
17132SDNode *
17133 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17134 SelectionDAG &DAG) const {
17135 if (Node->getOpcode() == ISD::CopyToReg) {
17136 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17137 SDValue SrcVal = Node->getOperand(2);
17138
17139 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17140 // to try understanding copies to physical registers.
17141 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17142 SDLoc SL(Node);
17144 SDValue VReg = DAG.getRegister(
17145 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17146
17147 SDNode *Glued = Node->getGluedNode();
17148 SDValue ToVReg = DAG.getCopyToReg(
17149 Node->getOperand(0), SL, VReg, SrcVal,
17150 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17151 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17152 VReg, ToVReg.getValue(1));
17153 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17154 DAG.RemoveDeadNode(Node);
17155 return ToResultReg.getNode();
17156 }
17157 }
17158
17160 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17161 if (!isFrameIndexOp(Node->getOperand(i))) {
17162 Ops.push_back(Node->getOperand(i));
17163 continue;
17164 }
17165
17166 SDLoc DL(Node);
17167 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17168 Node->getOperand(i).getValueType(),
17169 Node->getOperand(i)),
17170 0));
17171 }
17172
17173 return DAG.UpdateNodeOperands(Node, Ops);
17174}
17175
17176/// Fold the instructions after selecting them.
17177/// Returns null if users were already updated.
17178 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17179 SelectionDAG &DAG) const {
17180 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17181 unsigned Opcode = Node->getMachineOpcode();
17182
17183 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17184 !TII->isGather4(Opcode) &&
17185 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17186 return adjustWritemask(Node, DAG);
17187 }
17188
17189 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17190 legalizeTargetIndependentNode(Node, DAG);
17191 return Node;
17192 }
17193
17194 switch (Opcode) {
17195 case AMDGPU::V_DIV_SCALE_F32_e64:
17196 case AMDGPU::V_DIV_SCALE_F64_e64: {
17197 // Satisfy the operand register constraint when one of the inputs is
17198 // undefined. Ordinarily each undef value will have its own implicit_def of
17199 // a vreg, so force these to use a single register.
17200 SDValue Src0 = Node->getOperand(1);
17201 SDValue Src1 = Node->getOperand(3);
17202 SDValue Src2 = Node->getOperand(5);
17203
17204 if ((Src0.isMachineOpcode() &&
17205 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17206 (Src0 == Src1 || Src0 == Src2))
17207 break;
17208
17209 MVT VT = Src0.getValueType().getSimpleVT();
17210 const TargetRegisterClass *RC =
17211 getRegClassFor(VT, Src0.getNode()->isDivergent());
17212
17213 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17214 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17215
17216 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17217 Src0, SDValue());
17218
17219 // src0 must be the same register as src1 or src2, even if the value is
17220 // undefined, so make sure we don't violate this constraint.
17221 if (Src0.isMachineOpcode() &&
17222 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17223 if (Src1.isMachineOpcode() &&
17224 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17225 Src0 = Src1;
17226 else if (Src2.isMachineOpcode() &&
17227 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17228 Src0 = Src2;
17229 else {
17230 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17231 Src0 = UndefReg;
17232 Src1 = UndefReg;
17233 }
17234 } else
17235 break;
17236
17237 SmallVector<SDValue, 9> Ops(Node->ops());
17238 Ops[1] = Src0;
17239 Ops[3] = Src1;
17240 Ops[5] = Src2;
17241 Ops.push_back(ImpDef.getValue(1));
17242 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17243 }
17244 default:
17245 break;
17246 }
17247
17248 return Node;
17249}
17250
17251// Any MIMG instructions that use tfe or lwe require an initialization of the
17252// result register that will be written in the case of a memory access failure.
17253// The required code is also added to tie this init code to the result of the
17254// img instruction.
17257 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17258 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17259 MachineBasicBlock &MBB = *MI.getParent();
17260
17261 int DstIdx =
17262 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17263 unsigned InitIdx = 0;
17264
17265 if (TII->isImage(MI)) {
17266 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17267 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17268 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17269
17270 if (!TFE && !LWE) // intersect_ray
17271 return;
17272
17273 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17274 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17275 unsigned D16Val = D16 ? D16->getImm() : 0;
17276
17277 if (!TFEVal && !LWEVal)
17278 return;
17279
17280 // At least one of TFE or LWE is non-zero
17281 // We have to insert a suitable initialization of the result value and
17282 // tie this to the dest of the image instruction.
17283
17284 // Calculate which dword we have to initialize to 0.
17285 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17286
17287 // check that dmask operand is found.
17288 assert(MO_Dmask && "Expected dmask operand in instruction");
17289
17290 unsigned dmask = MO_Dmask->getImm();
17291 // Determine the number of active lanes taking into account the
17292 // Gather4 special case
17293 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17294
17295 bool Packed = !Subtarget->hasUnpackedD16VMem();
17296
17297 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17298
17299 // Abandon the attempt if the dst size isn't large enough
17300 // - this is in fact an error, but it is picked up elsewhere and
17301 // reported correctly.
17302 uint32_t DstSize =
17303 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17304 if (DstSize < InitIdx)
17305 return;
17306 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17307 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17308 } else {
17309 return;
17310 }
17311
17312 const DebugLoc &DL = MI.getDebugLoc();
17313
17314 // Create a register for the initialization value.
17315 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17316 unsigned NewDst = 0; // Final initialized value will be in here
17317
17318 // If PRTStrictNull feature is enabled (the default) then initialize
17319 // all the result registers to 0, otherwise just the error indication
17320 // register (VGPRn+1)
17321 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17322 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
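// For example, a 4-channel TFE load with PRTStrictNull zero-initializes all
// five result dwords; without PRTStrictNull only the trailing error dword is
// initialized.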
17323
17324 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17325 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17326 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17327 // Initialize dword
17328 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17329 // clang-format off
17330 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17331 .addImm(0);
17332 // clang-format on
17333 // Insert into the super-reg
17334 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17335 .addReg(PrevDst)
17336 .addReg(SubReg)
17337 .addImm(AMDGPU::sub0 + CurrIdx);
17338
17339 PrevDst = NewDst;
17340 }
17341
17342 // Add as an implicit operand
17343 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17344
17345 // Tie the just added implicit operand to the dst
17346 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17347}
17348
17349/// Assign the register class depending on the number of
17350/// bits set in the writemask
17351 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17352 SDNode *Node) const {
17353 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17354
17355 MachineFunction *MF = MI.getParent()->getParent();
17356 MachineRegisterInfo &MRI = MF->getRegInfo();
17357 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17358
17359 if (TII->isVOP3(MI.getOpcode())) {
17360 // Make sure constant bus requirements are respected.
17361 TII->legalizeOperandsVOP3(MRI, MI);
17362
17363 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17364 // This saves a chain-copy of registers and better balances register
17365 // use between vgpr and agpr as agpr tuples tend to be big.
17366 if (!MI.getDesc().operands().empty()) {
17367 unsigned Opc = MI.getOpcode();
17368 bool HasAGPRs = Info->mayNeedAGPRs();
17369 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17370 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17371 for (auto I :
17372 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17373 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17374 if (I == -1)
17375 break;
17376 if ((I == Src2Idx) && (HasAGPRs))
17377 break;
17378 MachineOperand &Op = MI.getOperand(I);
17379 if (!Op.isReg() || !Op.getReg().isVirtual())
17380 continue;
17381 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17382 if (!TRI->hasAGPRs(RC))
17383 continue;
17384 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17385 if (!Src || !Src->isCopy() ||
17386 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17387 continue;
17388 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17389 // All uses of agpr64 and agpr32 can also accept vgpr except for
17390 // v_accvgpr_read, but we do not produce agpr reads during selection,
17391 // so no use checks are needed.
17392 MRI.setRegClass(Op.getReg(), NewRC);
17393 }
17394
17395 if (TII->isMAI(MI)) {
17396 // The ordinary src0, src1, src2 were legalized above.
17397 //
17398 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17399 // as a separate instruction.
17400 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17401 AMDGPU::OpName::scale_src0);
17402 if (Src0Idx != -1) {
17403 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17404 AMDGPU::OpName::scale_src1);
17405 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17406 TII->usesConstantBus(MRI, MI, Src1Idx))
17407 TII->legalizeOpWithMove(MI, Src1Idx);
17408 }
17409 }
17410
17411 if (!HasAGPRs)
17412 return;
17413
17414 // Resolve the rest of AV operands to AGPRs.
17415 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17416 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17417 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17418 if (TRI->isVectorSuperClass(RC)) {
17419 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17420 MRI.setRegClass(Src2->getReg(), NewRC);
17421 if (Src2->isTied())
17422 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17423 }
17424 }
17425 }
17426 }
17427
17428 return;
17429 }
17430
17431 if (TII->isImage(MI))
17432 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17433}
17434
17436 uint64_t Val) {
17437 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17438 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17439}
17440
17442 const SDLoc &DL,
17443 SDValue Ptr) const {
17445
17446 // Build the half of the subregister with the constants before building the
17447 // full 128-bit register. If we are building multiple resource descriptors,
17448 // this will allow CSEing of the 2-component register.
17449 const SDValue Ops0[] = {
17450 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17451 buildSMovImm32(DAG, DL, 0),
17452 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17453 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17454 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17455
17456 SDValue SubRegHi = SDValue(
17457 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17458
17459 // Combine the constants and the pointer.
17460 const SDValue Ops1[] = {
17461 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17462 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17463 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17464
17465 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17466}
17467
17468/// Return a resource descriptor with the 'Add TID' bit enabled
17469/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17470/// of the resource descriptor) to create an offset, which is added to
17471/// the resource pointer.
17472 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17473 SDValue Ptr, uint32_t RsrcDword1,
17474 uint64_t RsrcDword2And3) const {
17475 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17476 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17477 if (RsrcDword1) {
17478 PtrHi =
17479 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17480 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17481 0);
17482 }
17483
17484 SDValue DataLo =
17485 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17486 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17487
17488 const SDValue Ops[] = {
17489 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17490 PtrLo,
17491 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17492 PtrHi,
17493 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17494 DataLo,
17495 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17496 DataHi,
17497 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17498
17499 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17500}
17501
17502//===----------------------------------------------------------------------===//
17503// SI Inline Assembly Support
17504//===----------------------------------------------------------------------===//
17505
17506std::pair<unsigned, const TargetRegisterClass *>
17507 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17508 StringRef Constraint,
17509 MVT VT) const {
17510 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17511
17512 const TargetRegisterClass *RC = nullptr;
17513 if (Constraint.size() == 1) {
17514 // Check if we cannot determine the bit size of the given value type. This
17515 // can happen, for example, in this situation where we have an empty struct
17516 // (size 0): `call void asm "", "v"({} poison)`.
17517 if (VT == MVT::Other)
17518 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17519 const unsigned BitWidth = VT.getSizeInBits();
17520 switch (Constraint[0]) {
17521 default:
17522 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17523 case 's':
17524 case 'r':
17525 switch (BitWidth) {
17526 case 16:
17527 RC = &AMDGPU::SReg_32RegClass;
17528 break;
17529 case 64:
17530 RC = &AMDGPU::SGPR_64RegClass;
17531 break;
17532 default:
17533 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17534 if (!RC)
17535 return std::pair(0U, nullptr);
17536 break;
17537 }
17538 break;
17539 case 'v':
17540 switch (BitWidth) {
17541 case 16:
17542 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17543 : &AMDGPU::VGPR_32_Lo256RegClass;
17544 break;
17545 default:
17546 RC = Subtarget->has1024AddressableVGPRs()
17547 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17548 : TRI->getVGPRClassForBitWidth(BitWidth);
17549 if (!RC)
17550 return std::pair(0U, nullptr);
17551 break;
17552 }
17553 break;
17554 case 'a':
17555 if (!Subtarget->hasMAIInsts())
17556 break;
17557 switch (BitWidth) {
17558 case 16:
17559 RC = &AMDGPU::AGPR_32RegClass;
17560 break;
17561 default:
17562 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17563 if (!RC)
17564 return std::pair(0U, nullptr);
17565 break;
17566 }
17567 break;
17568 }
17569 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17570 const unsigned BitWidth = VT.getSizeInBits();
17571 switch (BitWidth) {
17572 case 16:
17573 RC = &AMDGPU::AV_32RegClass;
17574 break;
17575 default:
17576 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17577 if (!RC)
17578 return std::pair(0U, nullptr);
17579 break;
17580 }
17581 }
17582
17583 // We actually support i128, i16 and f16 as inline parameters
17584 // even if they are not reported as legal
17585 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17586 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17587 return std::pair(0U, RC);
17588
17589 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17590 if (Kind != '\0') {
17591 if (Kind == 'v') {
17592 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17593 } else if (Kind == 's') {
17594 RC = &AMDGPU::SGPR_32RegClass;
17595 } else if (Kind == 'a') {
17596 RC = &AMDGPU::AGPR_32RegClass;
17597 }
17598
17599 if (RC) {
17600 if (NumRegs > 1) {
17601 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17602 return std::pair(0U, nullptr);
17603
17604 uint32_t Width = NumRegs * 32;
17605 // Prohibit constraints for register ranges with a width that does not
17606 // match the required type.
17607 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17608 return std::pair(0U, nullptr);
17609
17610 MCRegister Reg = RC->getRegister(Idx);
17611 if (SIRegisterInfo::isVGPRClass(RC))
17612 RC = TRI->getVGPRClassForBitWidth(Width);
17613 else if (SIRegisterInfo::isSGPRClass(RC))
17614 RC = TRI->getSGPRClassForBitWidth(Width);
17615 else if (SIRegisterInfo::isAGPRClass(RC))
17616 RC = TRI->getAGPRClassForBitWidth(Width);
17617 if (RC) {
17618 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17619 if (!Reg) {
17620 // The register class does not contain the requested register,
17621 // e.g., because it is an SGPR pair that would violate alignment
17622 // requirements.
17623 return std::pair(0U, nullptr);
17624 }
17625 return std::pair(Reg, RC);
17626 }
17627 }
17628
17629 // Check for lossy scalar/vector conversions.
17630 if (VT.isVector() && VT.getSizeInBits() != 32)
17631 return std::pair(0U, nullptr);
17632 if (Idx < RC->getNumRegs())
17633 return std::pair(RC->getRegister(Idx), RC);
17634 return std::pair(0U, nullptr);
17635 }
17636 }
17637
17638 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17639 if (Ret.first)
17640 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17641
17642 return Ret;
17643}
17644
17645static bool isImmConstraint(StringRef Constraint) {
17646 if (Constraint.size() == 1) {
17647 switch (Constraint[0]) {
17648 default:
17649 break;
17650 case 'I':
17651 case 'J':
17652 case 'A':
17653 case 'B':
17654 case 'C':
17655 return true;
17656 }
17657 } else if (Constraint == "DA" || Constraint == "DB") {
17658 return true;
17659 }
17660 return false;
17661}
17662
17663 SITargetLowering::ConstraintType
17664 SITargetLowering::getConstraintType(StringRef Constraint) const {
17665 if (Constraint.size() == 1) {
17666 switch (Constraint[0]) {
17667 default:
17668 break;
17669 case 's':
17670 case 'v':
17671 case 'a':
17672 return C_RegisterClass;
17673 }
17674 } else if (Constraint.size() == 2) {
17675 if (Constraint == "VA")
17676 return C_RegisterClass;
17677 }
17678 if (isImmConstraint(Constraint)) {
17679 return C_Other;
17680 }
17681 return TargetLowering::getConstraintType(Constraint);
17682}
17683
17684static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17686 Val = Val & maskTrailingOnes<uint64_t>(Size);
17687 }
17688 return Val;
17689}
17690
17691 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17692 StringRef Constraint,
17693 std::vector<SDValue> &Ops,
17694 SelectionDAG &DAG) const {
17695 if (isImmConstraint(Constraint)) {
17696 uint64_t Val;
17697 if (getAsmOperandConstVal(Op, Val) &&
17698 checkAsmConstraintVal(Op, Constraint, Val)) {
17699 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17700 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17701 }
17702 } else {
17703 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17704 }
17705}
17706
17707 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17708 unsigned Size = Op.getScalarValueSizeInBits();
17709 if (Size > 64)
17710 return false;
17711
17712 if (Size == 16 && !Subtarget->has16BitInsts())
17713 return false;
17714
17715 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17716 Val = C->getSExtValue();
17717 return true;
17718 }
17719 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17720 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17721 return true;
17722 }
17723 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17724 if (Size != 16 || Op.getNumOperands() != 2)
17725 return false;
17726 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17727 return false;
17728 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17729 Val = C->getSExtValue();
17730 return true;
17731 }
17732 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17733 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17734 return true;
17735 }
17736 }
17737
17738 return false;
17739}
17740
17741 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17742 uint64_t Val) const {
17743 if (Constraint.size() == 1) {
17744 switch (Constraint[0]) {
17745 case 'I':
17746 return AMDGPU::isInlinableIntLiteral(Val);
17747 case 'J':
17748 return isInt<16>(Val);
17749 case 'A':
17750 return checkAsmConstraintValA(Op, Val);
17751 case 'B':
17752 return isInt<32>(Val);
17753 case 'C':
17754 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17755 AMDGPU::isInlinableIntLiteral(Val);
17756 default:
17757 break;
17758 }
17759 } else if (Constraint.size() == 2) {
17760 if (Constraint == "DA") {
17761 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17762 int64_t LoBits = static_cast<int32_t>(Val);
17763 return checkAsmConstraintValA(Op, HiBits, 32) &&
17764 checkAsmConstraintValA(Op, LoBits, 32);
17765 }
17766 if (Constraint == "DB") {
17767 return true;
17768 }
17769 }
17770 llvm_unreachable("Invalid asm constraint");
17771}
17772
17773 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
17774 unsigned MaxSize) const {
17775 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17776 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17777 if (Size == 16) {
17778 MVT VT = Op.getSimpleValueType();
17779 switch (VT.SimpleTy) {
17780 default:
17781 return false;
17782 case MVT::i16:
17783 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17784 case MVT::f16:
17785 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17786 case MVT::bf16:
17787 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17788 case MVT::v2i16:
17789 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17790 case MVT::v2f16:
17791 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17792 case MVT::v2bf16:
17793 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17794 }
17795 }
17796 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17797 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17798 return true;
17799 return false;
17800}
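// Illustrative examples, assuming the usual GFX inline-literal rules: for an
// f32 operand, integers in [-16, 64] and constants such as 0.0, +/-0.5,
// +/-1.0, +/-2.0 and +/-4.0 are inline literals, while an arbitrary constant
// like 3.14f is not.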
17801
17802static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17803 switch (UnalignedClassID) {
17804 case AMDGPU::VReg_64RegClassID:
17805 return AMDGPU::VReg_64_Align2RegClassID;
17806 case AMDGPU::VReg_96RegClassID:
17807 return AMDGPU::VReg_96_Align2RegClassID;
17808 case AMDGPU::VReg_128RegClassID:
17809 return AMDGPU::VReg_128_Align2RegClassID;
17810 case AMDGPU::VReg_160RegClassID:
17811 return AMDGPU::VReg_160_Align2RegClassID;
17812 case AMDGPU::VReg_192RegClassID:
17813 return AMDGPU::VReg_192_Align2RegClassID;
17814 case AMDGPU::VReg_224RegClassID:
17815 return AMDGPU::VReg_224_Align2RegClassID;
17816 case AMDGPU::VReg_256RegClassID:
17817 return AMDGPU::VReg_256_Align2RegClassID;
17818 case AMDGPU::VReg_288RegClassID:
17819 return AMDGPU::VReg_288_Align2RegClassID;
17820 case AMDGPU::VReg_320RegClassID:
17821 return AMDGPU::VReg_320_Align2RegClassID;
17822 case AMDGPU::VReg_352RegClassID:
17823 return AMDGPU::VReg_352_Align2RegClassID;
17824 case AMDGPU::VReg_384RegClassID:
17825 return AMDGPU::VReg_384_Align2RegClassID;
17826 case AMDGPU::VReg_512RegClassID:
17827 return AMDGPU::VReg_512_Align2RegClassID;
17828 case AMDGPU::VReg_1024RegClassID:
17829 return AMDGPU::VReg_1024_Align2RegClassID;
17830 case AMDGPU::AReg_64RegClassID:
17831 return AMDGPU::AReg_64_Align2RegClassID;
17832 case AMDGPU::AReg_96RegClassID:
17833 return AMDGPU::AReg_96_Align2RegClassID;
17834 case AMDGPU::AReg_128RegClassID:
17835 return AMDGPU::AReg_128_Align2RegClassID;
17836 case AMDGPU::AReg_160RegClassID:
17837 return AMDGPU::AReg_160_Align2RegClassID;
17838 case AMDGPU::AReg_192RegClassID:
17839 return AMDGPU::AReg_192_Align2RegClassID;
17840 case AMDGPU::AReg_256RegClassID:
17841 return AMDGPU::AReg_256_Align2RegClassID;
17842 case AMDGPU::AReg_512RegClassID:
17843 return AMDGPU::AReg_512_Align2RegClassID;
17844 case AMDGPU::AReg_1024RegClassID:
17845 return AMDGPU::AReg_1024_Align2RegClassID;
17846 default:
17847 return -1;
17848 }
17849}
17850
17851// Figure out which registers should be reserved for stack access. Only after
17852// the function is legalized do we know all of the non-spill stack objects or if
17853// calls are present.
17854 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
17855 MachineRegisterInfo &MRI = MF.getRegInfo();
17856 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17857 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17858 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17859 const SIInstrInfo *TII = ST.getInstrInfo();
17860
17861 if (Info->isEntryFunction()) {
17862 // Callable functions have fixed registers used for stack access.
17863 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17864 }
17865
17866 // TODO: Move this logic to getReservedRegs()
17867 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17868 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17869 Register SReg = ST.isWave32()
17870 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17871 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17872 &AMDGPU::SGPR_64RegClass);
17873 Info->setSGPRForEXECCopy(SReg);
17874
17875 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17876 Info->getStackPtrOffsetReg()));
17877 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17878 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17879
17880 // We need to worry about replacing the default register with itself in case
17881 // of MIR testcases missing the MFI.
17882 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17883 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17884
17885 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17886 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17887
17888 Info->limitOccupancy(MF);
17889
17890 if (ST.isWave32() && !MF.empty()) {
17891 for (auto &MBB : MF) {
17892 for (auto &MI : MBB) {
17893 TII->fixImplicitOperands(MI);
17894 }
17895 }
17896 }
17897
17898 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17899 // classes if required. Ideally the register class constraints would differ
17900 // per-subtarget, but there's no easy way to achieve that right now. This is
17901 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17902 // from using them as the register class for legal types.
17903 if (ST.needsAlignedVGPRs()) {
17904 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17905 const Register Reg = Register::index2VirtReg(I);
17906 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17907 if (!RC)
17908 continue;
17909 int NewClassID = getAlignedAGPRClassID(RC->getID());
17910 if (NewClassID != -1)
17911 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17912 }
17913 }
17914
17915 TargetLoweringBase::finalizeLowering(MF);
17916}
17917
17918 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
17919 KnownBits &Known,
17920 const APInt &DemandedElts,
17921 const SelectionDAG &DAG,
17922 unsigned Depth) const {
17923 Known.resetAll();
17924 unsigned Opc = Op.getOpcode();
17925 switch (Opc) {
17926 case ISD::INTRINSIC_WO_CHAIN: {
17927 unsigned IID = Op.getConstantOperandVal(0);
17928 switch (IID) {
17929 case Intrinsic::amdgcn_mbcnt_lo:
17930 case Intrinsic::amdgcn_mbcnt_hi: {
17931 const GCNSubtarget &ST =
17932 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
17933 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17934 // most 31 + src1.
17935 Known.Zero.setBitsFrom(
17936 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17937 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17938 Known = KnownBits::add(Known, Known2);
17939 return;
17940 }
17941 }
17942 break;
17943 }
17944 }
17945 AMDGPUTargetLowering::computeKnownBitsForTargetNode(
17946 Op, Known, DemandedElts, DAG, Depth);
17947}
17948
17949 void SITargetLowering::computeKnownBitsForFrameIndex(
17950 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17951 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
17952
17953 // Set the high bits to zero based on the maximum allowed scratch size per
17954 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17955 // calculation won't overflow, so assume the sign bit is never set.
17956 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17957}
17958
17959 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
17960 GISelValueTracking &VT, KnownBits &Known,
17961 unsigned Dim) {
17962 unsigned MaxValue =
17963 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17964 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17965}
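// Example (illustrative): with a maximum workitem ID of 1023 in a dimension,
// countl_zero(1023) == 22, so the top 22 bits of the ID are known to be zero.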
17966
17967 static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
17968 KnownBits &Known, const APInt &DemandedElts,
17969 unsigned BFEWidth, bool SExt, unsigned Depth) {
17970 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
17971 const MachineOperand &Src1 = MI.getOperand(2);
17972
17973 unsigned Src1Cst = 0;
17974 if (Src1.isImm()) {
17975 Src1Cst = Src1.getImm();
17976 } else if (Src1.isReg()) {
17977 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17978 if (!Cst)
17979 return;
17980 Src1Cst = Cst->Value.getZExtValue();
17981 } else {
17982 return;
17983 }
17984
17985 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17986 // Width is always [22:16].
17987 const unsigned Offset =
17988 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17989 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
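// Example (illustrative): for a 32-bit BFE with Src1Cst == 0x0008000C, the
// decode gives Offset == 12 and Width == 8, i.e. bits [19:12] of the source.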
17990
17991 if (Width >= BFEWidth) // Ill-formed.
17992 return;
17993
17994 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17995 Depth + 1);
17996
17997 Known = Known.extractBits(Width, Offset);
17998
17999 if (SExt)
18000 Known = Known.sext(BFEWidth);
18001 else
18002 Known = Known.zext(BFEWidth);
18003}
18004
18005 void SITargetLowering::computeKnownBitsForTargetInstr(
18006 GISelValueTracking &VT, Register R, KnownBits &Known,
18007 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
18008 unsigned Depth) const {
18009 Known.resetAll();
18010 const MachineInstr *MI = MRI.getVRegDef(R);
18011 switch (MI->getOpcode()) {
18012 case AMDGPU::S_BFE_I32:
18013 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18014 /*SExt=*/true, Depth);
18015 case AMDGPU::S_BFE_U32:
18016 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18017 /*SExt=*/false, Depth);
18018 case AMDGPU::S_BFE_I64:
18019 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18020 /*SExt=*/true, Depth);
18021 case AMDGPU::S_BFE_U64:
18022 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18023 /*SExt=*/false, Depth);
18024 case AMDGPU::G_INTRINSIC:
18025 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18026 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18027 switch (IID) {
18028 case Intrinsic::amdgcn_workitem_id_x:
18029 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18030 break;
18031 case Intrinsic::amdgcn_workitem_id_y:
18032 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18033 break;
18034 case Intrinsic::amdgcn_workitem_id_z:
18035 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18036 break;
18037 case Intrinsic::amdgcn_mbcnt_lo:
18038 case Intrinsic::amdgcn_mbcnt_hi: {
18039 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18040 // most 31 + src1.
18041 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18042 ? getSubtarget()->getWavefrontSizeLog2()
18043 : 5);
18044 KnownBits Known2;
18045 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18046 Depth + 1);
18047 Known = KnownBits::add(Known, Known2);
18048 break;
18049 }
18050 case Intrinsic::amdgcn_groupstaticsize: {
18051 // We can report everything over the maximum size as 0. We can't report
18052 // based on the actual size because we don't know if it's accurate or not
18053 // at any given point.
18054 Known.Zero.setHighBits(
18055 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18056 break;
18057 }
18058 }
18059 break;
18060 }
18061 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18062 Known.Zero.setHighBits(24);
18063 break;
18064 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18065 Known.Zero.setHighBits(16);
18066 break;
18067 case AMDGPU::G_AMDGPU_SMED3:
18068 case AMDGPU::G_AMDGPU_UMED3: {
18069 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18070
18071 KnownBits Known2;
18072 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18073 if (Known2.isUnknown())
18074 break;
18075
18076 KnownBits Known1;
18077 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18078 if (Known1.isUnknown())
18079 break;
18080
18081 KnownBits Known0;
18082 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18083 if (Known0.isUnknown())
18084 break;
18085
18086 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
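// med3 always returns one of its three source operands, so a bit is known in
// the result only when all three sources agree on it.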
18087 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18088 Known.One = Known0.One & Known1.One & Known2.One;
18089 break;
18090 }
18091 }
18092}
18093
18094 Align SITargetLowering::computeKnownAlignForTargetInstr(
18095 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
18096 unsigned Depth) const {
18097 const MachineInstr *MI = MRI.getVRegDef(R);
18098 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18099 // FIXME: Can this move to generic code? What about the case where the call
18100 // site specifies a lower alignment?
18101 Intrinsic::ID IID = GI->getIntrinsicID();
18102 LLVMContext &Ctx = MI->getMF()->getFunction().getContext();
18103 AttributeList Attrs =
18104 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18105 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18106 return *RetAlign;
18107 }
18108 return Align(1);
18109}
18110
18111 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
18112 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
18113 const Align CacheLineAlign = Align(64);
18114
18115 // Pre-GFX10 targets did not benefit from loop alignment
18116 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18117 getSubtarget()->hasInstFwdPrefetchBug())
18118 return PrefAlign;
18119
18120 // On GFX10 I$ is 4 x 64 bytes cache lines.
18121 // By default prefetcher keeps one cache line behind and reads two ahead.
18122 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18123 // behind and one ahead.
18124 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18125 // If loop fits 64 bytes it always spans no more than two cache lines and
18126 // does not need an alignment.
18127 // Else, if the loop is at most 128 bytes, we do not need to modify the prefetch;
18128 // if it is at most 192 bytes, we need two lines behind.
18129
18130 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18131 const MachineBasicBlock *Header = ML->getHeader();
18132 if (Header->getAlignment() != PrefAlign)
18133 return Header->getAlignment(); // Already processed.
18134
18135 unsigned LoopSize = 0;
18136 for (const MachineBasicBlock *MBB : ML->blocks()) {
18137 // If an inner loop block is aligned, assume on average half of the alignment
18138 // size to be added as nops.
18139 if (MBB != Header)
18140 LoopSize += MBB->getAlignment().value() / 2;
18141
18142 for (const MachineInstr &MI : *MBB) {
18143 LoopSize += TII->getInstSizeInBytes(MI);
18144 if (LoopSize > 192)
18145 return PrefAlign;
18146 }
18147 }
18148
18149 if (LoopSize <= 64)
18150 return PrefAlign;
18151
18152 if (LoopSize <= 128)
18153 return CacheLineAlign;
18154
18155 // If any of the parent loops is surrounded by prefetch instructions, do not
18156 // insert a new one for the inner loop, which would reset the parent's settings.
18157 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18158 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18159 auto I = Exit->getFirstNonDebugInstr();
18160 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18161 return CacheLineAlign;
18162 }
18163 }
18164
18165 MachineBasicBlock *Pre = ML->getLoopPreheader();
18166 MachineBasicBlock *Exit = ML->getExitBlock();
18167
18168 if (Pre && Exit) {
18169 auto PreTerm = Pre->getFirstTerminator();
18170 if (PreTerm == Pre->begin() ||
18171 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18172 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18173 .addImm(1); // prefetch 2 lines behind PC
18174
18175 auto ExitHead = Exit->getFirstNonDebugInstr();
18176 if (ExitHead == Exit->end() ||
18177 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18178 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18179 .addImm(2); // prefetch 1 line behind PC
18180 }
18181
18182 return CacheLineAlign;
18183}
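// Worked example (illustrative): a 100-byte loop returns CacheLineAlign
// without touching the prefetch mode; a 160-byte loop additionally gets
// S_INST_PREFETCH instructions in its preheader and exit block; anything over
// 192 bytes keeps the default preferred alignment.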
18184
18185 LLVM_ATTRIBUTE_UNUSED
18186static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18187 assert(N->getOpcode() == ISD::CopyFromReg);
18188 do {
18189 // Follow the chain until we find an INLINEASM node.
18190 N = N->getOperand(0).getNode();
18191 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18192 return true;
18193 } while (N->getOpcode() == ISD::CopyFromReg);
18194 return false;
18195}
18196
18197 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
18198 FunctionLoweringInfo *FLI,
18199 UniformityInfo *UA) const {
18200 switch (N->getOpcode()) {
18201 case ISD::CopyFromReg: {
18202 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18203 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18204 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18205 Register Reg = R->getReg();
18206
18207 // FIXME: Why does this need to consider isLiveIn?
18208 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18209 return !TRI->isSGPRReg(MRI, Reg);
18210
18211 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18212 return UA->isDivergent(V);
18213 
18214 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
18213
18215 return !TRI->isSGPRReg(MRI, Reg);
18216 }
18217 case ISD::LOAD: {
18218 const LoadSDNode *L = cast<LoadSDNode>(N);
18219 unsigned AS = L->getAddressSpace();
18220 // A flat load may access private memory.
18221 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
18222 }
18223 case ISD::CALLSEQ_END:
18224 return true;
18225 case ISD::INTRINSIC_WO_CHAIN:
18226 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18227 case ISD::INTRINSIC_W_CHAIN:
18228 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18247 // Target-specific read-modify-write atomics are sources of divergence.
18248 return true;
18249 default:
18250 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18251 // Generic read-modify-write atomics are sources of divergence.
18252 return A->readMem() && A->writeMem();
18253 }
18254 return false;
18255 }
18256}
18257
18258 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
18259 EVT VT) const {
18260 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18261 case MVT::f32:
18262 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
18263 case MVT::f64:
18264 case MVT::f16:
18265 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
18266 default:
18267 return false;
18268 }
18269}
18270
18271 bool SITargetLowering::denormalsEnabledForType(
18272 LLT Ty, const MachineFunction &MF) const {
18273 switch (Ty.getScalarSizeInBits()) {
18274 case 32:
18275 return !denormalModeIsFlushAllF32(MF);
18276 case 64:
18277 case 16:
18278 return !denormalModeIsFlushAllF64F16(MF);
18279 default:
18280 return false;
18281 }
18282}
18283
18284 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
18285 const APInt &DemandedElts,
18286 const SelectionDAG &DAG,
18287 bool SNaN,
18288 unsigned Depth) const {
18289 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18290 const MachineFunction &MF = DAG.getMachineFunction();
18291 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18292
18293 if (Info->getMode().DX10Clamp)
18294 return true; // Clamped to 0.
18295 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18296 }
18297
18298 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
18299 DAG, SNaN, Depth);
18300}
18301
18302// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18303// and do not support FP32 denormals, and only support v2f16/f64 denormals.
18304 static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
18305 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18306 return true;
18307
18308 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
18309 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18310 if (DenormMode == DenormalMode::getPreserveSign())
18311 return true;
18312
18313 // TODO: Remove this.
18314 return RMW->getFunction()
18315 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18316 .getValueAsBool();
18317}
18318
18319 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
18320 LLVMContext &Ctx = RMW->getContext();
18321 StringRef MemScope =
18322 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18323
18324 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18325 << "Hardware instruction generated for atomic "
18326 << RMW->getOperationName(RMW->getOperation())
18327 << " operation at memory scope " << MemScope;
18328}
18329
18330static bool isV2F16OrV2BF16(Type *Ty) {
18331 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18332 Type *EltTy = VT->getElementType();
18333 return VT->getNumElements() == 2 &&
18334 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18335 }
18336
18337 return false;
18338}
18339
18340static bool isV2F16(Type *Ty) {
18341 auto *VT = dyn_cast<FixedVectorType>(Ty);
18342 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18343}
18344
18345static bool isV2BF16(Type *Ty) {
18346 auto *VT = dyn_cast<FixedVectorType>(Ty);
18347 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18348}
18349
18350/// \return true if atomicrmw integer ops work for the type.
18351static bool isAtomicRMWLegalIntTy(Type *Ty) {
18352 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18353 unsigned BW = IT->getBitWidth();
18354 return BW == 32 || BW == 64;
18355 }
18356
18357 return false;
18358}
18359
18360/// \return true if this atomicrmw xchg type can be selected.
18361static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18362 Type *Ty = RMW->getType();
18363 if (isAtomicRMWLegalIntTy(Ty))
18364 return true;
18365
18366 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18367 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18368 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18369 return BW == 32 || BW == 64;
18370 }
18371
18372 if (Ty->isFloatTy() || Ty->isDoubleTy())
18373 return true;
18374
18375 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18376 return VT->getNumElements() == 2 &&
18377 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18378 }
18379
18380 return false;
18381}
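// Illustrative consequence: an atomicrmw xchg on i32/i64, on a 32- or 64-bit
// pointer, on float/double, or on a two-element 16-bit vector is considered
// directly selectable; other types take the caller's fallback expansion.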
18382
18383/// \returns true if it's valid to emit a native instruction for \p RMW, based
18384/// on the properties of the target memory.
18385static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18386 const AtomicRMWInst *RMW,
18387 bool HasSystemScope) {
18388 // The remote/fine-grained access logic is different from the integer
18389 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18390 // fine-grained access does not work, even for a device local allocation.
18391 //
18392 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18393 // allocations work.
18394 if (HasSystemScope) {
18395 if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics() &&
18396 RMW->hasMetadata("amdgpu.no.remote.memory"))
18397 return true;
18398 if (Subtarget.hasEmulatedSystemScopeAtomics())
18399 return true;
18400 } else if (Subtarget.supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18401 return true;
18402
18403 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18404}
18405
18406/// \return Action to perform on AtomicRMWInsts for integer operations.
18407 static TargetLowering::AtomicExpansionKind
18408 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
18409 return isAtomicRMWLegalIntTy(RMW->getType())
18410 ? TargetLowering::AtomicExpansionKind::None
18411 : TargetLowering::AtomicExpansionKind::CmpXChg;
18412 }
18413
18414/// Return if a flat address space atomicrmw can access private memory.
18415 static bool flatInstrMayAccessPrivate(const Instruction *I) {
18416 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18417 return !MD ||
18419}
18420
18428
18429 TargetLowering::AtomicExpansionKind
18430 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
18431 unsigned AS = RMW->getPointerAddressSpace();
18432 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18433 return getPrivateAtomicExpansionKind(*getSubtarget());
18434
18435 // 64-bit flat atomics that dynamically reside in private memory will silently
18436 // be dropped.
18437 //
18438 // Note that we will emit a new copy of the original atomic in the expansion,
18439 // which will be incrementally relegalized.
18440 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18441 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18442 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18443 flatInstrMayAccessPrivate(RMW))
18444 return AtomicExpansionKind::CustomExpand;
18445
18446 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18447 OptimizationRemarkEmitter ORE(RMW->getFunction());
18448 ORE.emit([=]() {
18449 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18450 });
18451 return Kind;
18452 };
18453
18454 auto SSID = RMW->getSyncScopeID();
18455 bool HasSystemScope =
18456 SSID == SyncScope::System ||
18457 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18458
18459 auto Op = RMW->getOperation();
18460 switch (Op) {
18461 case AtomicRMWInst::Xchg:
18462 // PCIe supports add and xchg for system atomics.
18463 return isAtomicRMWLegalXChgTy(RMW)
18464 ? TargetLowering::AtomicExpansionKind::None
18465 : TargetLowering::AtomicExpansionKind::CmpXChg;
18466 case AtomicRMWInst::Add:
18467 // PCIe supports add and xchg for system atomics.
18468 return atomicSupportedIfLegalIntType(RMW);
18469 case AtomicRMWInst::Sub:
18470 case AtomicRMWInst::And:
18471 case AtomicRMWInst::Or:
18472 case AtomicRMWInst::Xor:
18473 case AtomicRMWInst::Max:
18474 case AtomicRMWInst::Min:
18481 if (Subtarget->hasEmulatedSystemScopeAtomics())
18482 return atomicSupportedIfLegalIntType(RMW);
18483
18484 // On most subtargets, for atomicrmw operations other than add/xchg,
18485 // whether or not the instructions will behave correctly depends on where
18486 // the address physically resides and what interconnect is used in the
18487 // system configuration. On some targets the instruction will nop,
18488 // and in others synchronization will only occur at degraded device scope.
18489 //
18490 // If the allocation is known local to the device, the instructions should
18491 // work correctly.
18492 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18493 return atomicSupportedIfLegalIntType(RMW);
18494
18495 // If fine-grained remote memory works at device scope, we don't need to
18496 // do anything.
18497 if (!HasSystemScope &&
18498 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18499 return atomicSupportedIfLegalIntType(RMW);
18500
18501 // If we are targeting a remote allocated address, it depends what kind of
18502 // allocation the address belongs to.
18503 //
18504 // If the allocation is fine-grained (in host memory, or in PCIe peer
18505 // device memory), the operation will fail depending on the target.
18506 //
18507 // Note fine-grained host memory access does work on APUs or if XGMI is
18508 // used, but we do not know if we are targeting an APU or the system
18509 // configuration from the ISA version/target-cpu.
18510 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18511 return atomicSupportedIfLegalIntType(RMW);
18512
18515 // Atomic sub/or/xor do not work over PCI express, but atomic add
18516 // does. InstCombine transforms these with 0 to or, so undo that.
18517 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18518 ConstVal && ConstVal->isNullValue())
18519 return AtomicExpansionKind::Expand;
18520 }
18521
18522 // If the allocation could be in remote, fine-grained memory, the rmw
18523 // instructions may fail. cmpxchg should work, so emit that. On some
18524 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18525 // even work, so you're out of luck anyway.
18526
18527 // In summary:
18528 //
18529 // Cases that may fail:
18530 // - fine-grained pinned host memory
18531 // - fine-grained migratable host memory
18532 // - fine-grained PCIe peer device
18533 //
18534 // Cases that should work, but may be treated overly conservatively.
18535 // - fine-grained host memory on an APU
18536 // - fine-grained XGMI peer device
18537 return AtomicExpansionKind::CmpXChg;
18538 }
18539
18540 return atomicSupportedIfLegalIntType(RMW);
18541 }
18542 case AtomicRMWInst::FAdd: {
18543 Type *Ty = RMW->getType();
18544
18545 // TODO: Handle REGION_ADDRESS
18546 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18547 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18548 // is fixed to round-to-nearest-even.
18549 //
18550 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18551 // round-to-nearest-even.
18552 //
18553 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18554 // suggests it is OK if the floating-point mode may not match the calling
18555 // thread.
18556 if (Ty->isFloatTy()) {
18557 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18558 : AtomicExpansionKind::CmpXChg;
18559 }
18560
18561 if (Ty->isDoubleTy()) {
18562 // Ignores denormal mode, but we don't consider flushing mandatory.
18563 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18564 : AtomicExpansionKind::CmpXChg;
18565 }
18566
18567 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18568 return AtomicExpansionKind::None;
18569
18570 return AtomicExpansionKind::CmpXChg;
18571 }
18572
18573 // LDS atomics respect the denormal mode from the mode register.
18574 //
18575 // Traditionally f32 global/buffer memory atomics would unconditionally
18576 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18577 // flush.
18578 //
18579 // On targets with flat atomic fadd, denormals would flush depending on
18580 // whether the target address resides in LDS or global memory. We consider
18581 // this flat-maybe-flush as will-flush.
18582 if (Ty->isFloatTy() &&
18583 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18584 !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
18585 return AtomicExpansionKind::CmpXChg;
18586
18587 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18588 // safe. The message phrasing also should be better.
18589 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18590 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18591 // gfx942, gfx12
18592 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18593 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18594 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18595 // gfx90a, gfx942, gfx12
18596 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18597 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18598
18599 // gfx942, gfx12
18600 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18601 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18602 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18603 // gfx90a, gfx942, gfx12
18604 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18605 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18606
18607 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18608 // buffer. gfx12 does have the buffer version.
18609 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18610 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18611 }
18612
18613 // global and flat atomic fadd f64: gfx90a, gfx942.
18614 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18615 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18616
18617 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18618 if (Ty->isFloatTy()) {
18619 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18620 // gfx11+.
18621 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18622 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18623 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18624 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18625 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18626 } else {
18627 // gfx908
18628 if (RMW->use_empty() &&
18629 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18630 isV2F16(Ty))
18631 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18632 }
18633 }
18634
18635 // flat atomic fadd f32: gfx942, gfx11+.
18636 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18637 if (Subtarget->hasFlatAtomicFaddF32Inst())
18638 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18639
18640 // If it is in flat address space, and the type is float, we will try to
18641 // expand it, if the target supports global and lds atomic fadd. The
18642 // reason we need that is, in the expansion, we emit the check of
18643 // address space. If it is in global address space, we emit the global
18644 // atomic fadd; if it is in shared address space, we emit the LDS atomic
18645 // fadd.
18646 if (Subtarget->hasLDSFPAtomicAddF32()) {
18647 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18648 return AtomicExpansionKind::Expand;
18649 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18650 return AtomicExpansionKind::Expand;
18651 }
18652 }
18653 }
18654
18655 return AtomicExpansionKind::CmpXChg;
18656 }
18657 case AtomicRMWInst::FMin:
18658 case AtomicRMWInst::FMax: {
18659 Type *Ty = RMW->getType();
18660
18661 // LDS float and double fmin/fmax were always supported.
18662 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18663 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18664 : AtomicExpansionKind::CmpXChg;
18665 }
18666
18667 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18668 // For flat and global cases:
18669 // float, double in gfx7. Manual claims denormal support.
18670 // Removed in gfx8.
18671 // float, double restored in gfx10.
18672 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18673 //
18674 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18675 // no f32.
18676 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18677 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18678 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18679 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18680 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18681 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18682 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18683 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18684 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18685 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18686 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18687 }
18688 }
18689
18690 return AtomicExpansionKind::CmpXChg;
18691 }
18694 default:
18695 return AtomicExpansionKind::CmpXChg;
18696 }
18697
18698 llvm_unreachable("covered atomicrmw op switch");
18699}
18700
18707
18714
18715 TargetLowering::AtomicExpansionKind
18716 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
18717 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18718 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18719 return getPrivateAtomicExpansionKind(*getSubtarget());
18720
18721 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18722 return AtomicExpansionKind::None;
18723
18724 const DataLayout &DL = CmpX->getDataLayout();
18725
18726 Type *ValTy = CmpX->getNewValOperand()->getType();
18727
18728 // If a 64-bit flat atomic may alias private, we need to avoid using the
18729 // atomic in the private case.
18730 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18731 : AtomicExpansionKind::None;
18732}
18733
18734const TargetRegisterClass *
18735SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18736 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
18737 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18738 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18739 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18740 : &AMDGPU::SReg_32RegClass;
18741 if (!TRI->isSGPRClass(RC) && !isDivergent)
18742 return TRI->getEquivalentSGPRClass(RC);
18743 if (TRI->isSGPRClass(RC) && isDivergent)
18744 return TRI->getEquivalentVGPRClass(RC);
18745
18746 return RC;
18747}
18748
18749// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18750// uniform values (as produced by the mask results of control flow intrinsics)
18751// used outside of divergent blocks. The phi users need to also be treated as
18752// always uniform.
18753//
18754// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18755static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18756 unsigned WaveSize) {
18757 // FIXME: We assume we never cast the mask results of a control flow
18758 // intrinsic.
18759 // Early exit if the type won't be consistent as a compile time hack.
18760 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18761 if (!IT || IT->getBitWidth() != WaveSize)
18762 return false;
18763
18764 if (!isa<Instruction>(V))
18765 return false;
18766 if (!Visited.insert(V).second)
18767 return false;
18768 bool Result = false;
18769 for (const auto *U : V->users()) {
18770 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
18771 if (V == U->getOperand(1)) {
18772 switch (Intrinsic->getIntrinsicID()) {
18773 default:
18774 Result = false;
18775 break;
18776 case Intrinsic::amdgcn_if_break:
18777 case Intrinsic::amdgcn_if:
18778 case Intrinsic::amdgcn_else:
18779 Result = true;
18780 break;
18781 }
18782 }
18783 if (V == U->getOperand(0)) {
18784 switch (Intrinsic->getIntrinsicID()) {
18785 default:
18786 Result = false;
18787 break;
18788 case Intrinsic::amdgcn_end_cf:
18789 case Intrinsic::amdgcn_loop:
18790 Result = true;
18791 break;
18792 }
18793 }
18794 } else {
18795 Result = hasCFUser(U, Visited, WaveSize);
18796 }
18797 if (Result)
18798 break;
18799 }
18800 return Result;
18801}
18802
18803 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
18804 const Value *V) const {
18805 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18806 if (CI->isInlineAsm()) {
18807 // FIXME: This cannot give a correct answer. This should only trigger in
18808 // the case where inline asm returns mixed SGPR and VGPR results, used
18809 // outside the defining block. We don't have a specific result to
18810 // consider, so this assumes if any value is SGPR, the overall register
18811 // also needs to be SGPR.
18812 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18813 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
18814 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18815 for (auto &TC : TargetConstraints) {
18816 if (TC.Type == InlineAsm::isOutput) {
18817 ComputeConstraintToUse(TC, SDValue());
18818 const TargetRegisterClass *RC =
18819 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18820 TC.ConstraintVT)
18821 .second;
18822 if (RC && SIRI->isSGPRClass(RC))
18823 return true;
18824 }
18825 }
18826 }
18827 }
18828 SmallPtrSet<const Value *, 16> Visited;
18829 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18830}
18831
18832 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
18833 for (SDUse &Use : N->uses()) {
18834 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
18835 if (getBasePtrIndex(M) == Use.getOperandNo())
18836 return true;
18837 }
18838 }
18839 return false;
18840}
18841
18842 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
18843 SDValue N1) const {
18844 if (!N0.hasOneUse())
18845 return false;
18846 // Take the opportunity to keep N0 uniform
18847 if (N0->isDivergent() || !N1->isDivergent())
18848 return true;
18849 // Check if we have a good chance to form the memory access pattern with the
18850 // base and offset
18851 return (DAG.isBaseWithConstantOffset(N0) &&
18853}
18854
18855 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
18856 Register N0, Register N1) const {
18857 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18858}
18859
18860 MachineMemOperand::Flags
18861 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
18862 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18863 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
18864 if (I.getMetadata("amdgpu.noclobber"))
18865 Flags |= MONoClobber;
18866 if (I.getMetadata("amdgpu.last.use"))
18867 Flags |= MOLastUse;
18868 return Flags;
18869}
18870
18871 bool SITargetLowering::checkForPhysRegDependency(
18872 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18873 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18874 if (User->getOpcode() != ISD::CopyToReg)
18875 return false;
18876 if (!Def->isMachineOpcode())
18877 return false;
18878 const MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
18879 if (!MDef)
18880 return false;
18881
18882 unsigned ResNo = User->getOperand(Op).getResNo();
18883 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18884 return false;
18885 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18886 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18887 PhysReg = AMDGPU::SCC;
18888 const TargetRegisterClass *RC =
18889 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18890 Cost = RC->getCopyCost();
18891 return true;
18892 }
18893 return false;
18894}
18895
18896 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
18897 Instruction *AI) const {
18898 // Given: atomicrmw fadd ptr %addr, float %val ordering
18899 //
18900 // With this expansion we produce the following code:
18901 // [...]
18902 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18903 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18904 //
18905 // atomicrmw.shared:
18906 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18907 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18908 // float %val ordering
18909 // br label %atomicrmw.phi
18910 //
18911 // atomicrmw.check.private:
18912 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18913 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18914 //
18915 // atomicrmw.private:
18916 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18917 // %loaded.private = load float, ptr addrspace(5) %cast.private
18918 // %val.new = fadd float %loaded.private, %val
18919 // store float %val.new, ptr addrspace(5) %cast.private
18920 // br label %atomicrmw.phi
18921 //
18922 // atomicrmw.global:
18923 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18924 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18925 // float %val ordering
18926 // br label %atomicrmw.phi
18927 //
18928 // atomicrmw.phi:
18929 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18930 // [ %loaded.private, %atomicrmw.private ],
18931 // [ %loaded.global, %atomicrmw.global ]
18932 // br label %atomicrmw.end
18933 //
18934 // atomicrmw.end:
18935 // [...]
18936 //
18937 //
18938 // For 64-bit atomics which may reside in private memory, we perform a simpler
18939 // version that only inserts the private check, and uses the flat operation.
18940
18941 IRBuilder<> Builder(AI);
18942 LLVMContext &Ctx = Builder.getContext();
18943
18944 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18945 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18946 : AtomicCmpXchgInst::getPointerOperandIndex();
18947 Value *Addr = AI->getOperand(PtrOpIdx);
18948
18949 /// TODO: Only need to check private, then emit flat-known-not private (no
18950 /// need for shared block, or cast to global).
18951 AtomicCmpXchgInst *CX = dyn_cast<AtomicCmpXchgInst>(AI);
18952
18953 Align Alignment;
18954 if (RMW)
18955 Alignment = RMW->getAlign();
18956 else if (CX)
18957 Alignment = CX->getAlign();
18958 else
18959 llvm_unreachable("unhandled atomic operation");
18960
18961 // FullFlatEmulation is true if we need to issue the private, shared, and
18962 // global cases.
18963 //
18964 // If this is false, we are only dealing with the flat-targeting-private case,
18965 // where we only insert a check for private and still use the flat instruction
18966 // for global and shared.
18967
18968 bool FullFlatEmulation =
18969 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18970 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18971 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18972 RMW->getType()->isDoubleTy()));
18973
18974 // If the return value isn't used, do not introduce a false use in the phi.
18975 bool ReturnValueIsUsed = !AI->use_empty();
18976
18977 BasicBlock *BB = Builder.GetInsertBlock();
18978 Function *F = BB->getParent();
18979 BasicBlock *ExitBB =
18980 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18981 BasicBlock *SharedBB = nullptr;
18982
18983 BasicBlock *CheckPrivateBB = BB;
18984 if (FullFlatEmulation) {
18985 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18986 CheckPrivateBB =
18987 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18988 }
18989
18990 BasicBlock *PrivateBB =
18991 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18992 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18993 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18994
18995 std::prev(BB->end())->eraseFromParent();
18996 Builder.SetInsertPoint(BB);
18997
18998 Value *LoadedShared = nullptr;
18999 if (FullFlatEmulation) {
19000 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19001 {Addr}, nullptr, "is.shared");
19002 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19003 Builder.SetInsertPoint(SharedBB);
19004 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19005 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
19006
19007 Instruction *Clone = AI->clone();
19008 Clone->insertInto(SharedBB, SharedBB->end());
19009 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19010 LoadedShared = Clone;
19011
19012 Builder.CreateBr(PhiBB);
19013 Builder.SetInsertPoint(CheckPrivateBB);
19014 }
19015
19016 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19017 {Addr}, nullptr, "is.private");
19018 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19019
19020 Builder.SetInsertPoint(PrivateBB);
19021
19022 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19023 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
19024
19025 Value *LoadedPrivate;
19026 if (RMW) {
19027 LoadedPrivate = Builder.CreateAlignedLoad(
19028 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19029
19030 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19031 LoadedPrivate, RMW->getValOperand());
19032
19033 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19034 } else {
19035 auto [ResultLoad, Equal] =
19036 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19037 CX->getNewValOperand(), CX->getAlign());
19038
19039 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19040 ResultLoad, 0);
19041 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19042 }
19043
19044 Builder.CreateBr(PhiBB);
19045
19046 Builder.SetInsertPoint(GlobalBB);
19047
19048 // Continue using a flat instruction if we only emitted the check for private.
19049 Instruction *LoadedGlobal = AI;
19050 if (FullFlatEmulation) {
19051 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19052 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
19053 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19054 }
19055
19056 AI->removeFromParent();
19057 AI->insertInto(GlobalBB, GlobalBB->end());
19058
19059 // The new atomicrmw may go through another round of legalization later.
19060 if (!FullFlatEmulation) {
19061 // We inserted the runtime check already, make sure we do not try to
19062 // re-expand this.
19063 // TODO: Should union with any existing metadata.
19064 MDBuilder MDB(F->getContext());
19065 MDNode *RangeNotPrivate =
19068 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19069 RangeNotPrivate);
19070 }
19071
19072 Builder.CreateBr(PhiBB);
19073
19074 Builder.SetInsertPoint(PhiBB);
19075
19076 if (ReturnValueIsUsed) {
19077 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19078 AI->replaceAllUsesWith(Loaded);
19079 if (FullFlatEmulation)
19080 Loaded->addIncoming(LoadedShared, SharedBB);
19081 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19082 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19083 Loaded->takeName(AI);
19084 }
19085
19086 Builder.CreateBr(ExitBB);
19087}
19088
19089 static void convertScratchAtomicToFlatAtomic(Instruction *I,
19090 unsigned PtrOpIdx) {
19091 Value *PtrOp = I->getOperand(PtrOpIdx);
19094
19095 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19096 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19097 I->getIterator());
19098 I->setOperand(PtrOpIdx, ASCast);
19099}
19100
19103
19106
19109 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19110 ConstVal && ConstVal->isNullValue()) {
19111 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19112 AI->setOperation(AtomicRMWInst::Add);
19113
19114 // We may still need the private-alias-flat handling below.
19115
19116 // TODO: Skip this for cases where we cannot access remote memory.
19117 }
19118 }
19119
19120 // The non-flat expansions should only perform the de-canonicalization of
19121 // identity values.
19122 if (AI->getPointerAddressSpace() != AMDGPUAS::FLAT_ADDRESS)
19123 return;
19124
19125 emitExpandAtomicAddrSpacePredicate(AI);
19126}
19127
19134
19135 void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
19136 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19137 return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
19138
19139 llvm_unreachable(
19140 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19141}
19142
19143 void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
19144 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19145 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19146
19147 llvm_unreachable(
19148 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19149}
19150
19151LoadInst *
19152 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
19153 IRBuilder<> Builder(AI);
19154 auto Order = AI->getOrdering();
19155
19156 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19157 // cache must be flushed if the atomic ordering had release semantics. This is
19158 // not necessarily a fence; a release fence just happens to perform that flush.
19159 // Avoid replacing an atomicrmw that has release semantics.
19160 if (isReleaseOrStronger(Order))
19161 return nullptr;
19162
19163 LoadInst *LI = Builder.CreateAlignedLoad(
19164 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19165 LI->setAtomic(Order, AI->getSyncScopeID());
19166 LI->copyMetadata(*AI);
19167 LI->takeName(AI);
19168 AI->replaceAllUsesWith(LI);
19169 AI->eraseFromParent();
19170 return LI;
19171}
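// Illustrative use: an idempotent RMW such as
//   %old = atomicrmw or ptr %p, i32 0 monotonic
// has no store side effect and can be rewritten as an atomic load; with
// release (or stronger) ordering the transform is skipped above so the cache
// flush implied by the release is not lost.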
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
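The usual way such a split is expressed in the SelectionDAG is to bitcast the 64-bit value to v2i32 and extract both lanes. The sketch below is illustrative only (the helper name splitI64 is hypothetical, and element 0 is the low half on a little-endian target); it is not the exact implementation of split64BitValue.

#include <utility>
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Split a 64-bit integer SDValue into its low and high 32-bit halves.
static std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG, const SDLoc &SL,
                                            SDValue Op) {
  SDValue Vec = DAG.getBitcast(MVT::v2i32, Op);
  SDValue Lo = DAG.getExtractVectorElt(SL, MVT::i32, Vec, 0); // low 32 bits
  SDValue Hi = DAG.getExtractVectorElt(SL, MVT::i32, Vec, 1); // high 32 bits
  return {Lo, Hi};
}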
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
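A small illustrative sketch (not code from this file) of the APFloat factories and queries listed above; the choice of IEEEsingle semantics is arbitrary.

#include <cassert>
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

// Bit pattern of the largest finite positive float (0x7F7FFFFF).
static APInt largestF32Bits() {
  APFloat Largest = APFloat::getLargest(APFloat::IEEEsingle());
  assert(!Largest.isInfinity() && !Largest.isNegative());
  return Largest.bitcastToAPInt();
}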
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
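A small illustrative sketch (not code from this file) exercising the APInt bit-block helpers listed above; the concrete widths and bit positions are arbitrary.

#include <cassert>
#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintBitBlocks() {
  // Top 8 bits of a 32-bit value: bits [24, 32).
  APInt Mask = APInt::getHighBitsSet(32, 8);
  assert(Mask == APInt::getBitsSet(32, 24, 32));
  assert(Mask.countr_zero() == 24); // 24 trailing zero bits below the block

  APInt V(32, 0);
  V.setBitsFrom(16);                // set bits [16, 32) -> 0xFFFF0000
  assert(!V.isZero() && V.uge(Mask));
}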
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory being accessed by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory being accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
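A minimal sketch (a hypothetical helper, not the expansion logic of this file) showing how the AtomicRMWInst::BinOp enumeration above is typically inspected when deciding how an atomicrmw must be handled.

#include "llvm/IR/Instructions.h"
using namespace llvm;

// True for fadd/fsub/fmin/fmax atomicrmw operations.
static bool isFPAtomicRMW(const AtomicRMWInst *RMW) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    return true;
  default:
    return false;
  }
}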
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:370
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
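A minimal sketch (the helper is hypothetical, modelled on the allocate*Input routines above) of how CCState hands out a register and falls back to a stack slot once the register is taken.

#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Returns -1 if Reg was allocated, otherwise the assigned stack offset.
static int64_t allocateRegOrStack(CCState &CCInfo, MCPhysReg Reg,
                                  unsigned Size, Align Alignment) {
  MCRegister Allocated = CCInfo.AllocateReg(Reg);
  if (Allocated)
    return -1;
  // Register already in use (CCInfo.isAllocated(Reg)); use the stack instead.
  return CCInfo.AllocateStack(Size, Alignment);
}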
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1441
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
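A small illustrative sketch (not code from this file) of the MVT constructors and queries listed above.

#include <cassert>
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static void mvtQueries() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);          // <4 x i32>
  assert(V4I32.isVector() && V4I32.isPow2VectorType());
  assert(V4I32.getVectorNumElements() == 4);
  assert(V4I32.getScalarType() == MVT::i32);
  assert(V4I32.getSizeInBits().getFixedValue() == 128);
  assert(MVT::getIntegerVT(16) == MVT::i16);
}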
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
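A minimal sketch (register, opcode descriptor, and helper name are placeholders) of the BuildMI/MachineInstrBuilder chaining interface listed above.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;

// Emit a "Dst = Imm" style move; the instruction descriptor is passed in so
// the sketch stays target-neutral.
static void emitMoveImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const MCInstrDesc &MovDesc,
                        Register Dst, int64_t Imm) {
  BuildMI(MBB, I, DL, MovDesc, Dst).addImm(Imm);
}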
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
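A minimal sketch (the pointer info, memory type, and alignment are placeholders) of creating a MachineMemOperand with the flag bits listed above, using the MachineFunction::getMachineMemOperand overload documented earlier in this index.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Describe an invariant, dereferenceable 32-bit load.
static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                               MachinePointerInfo PtrInfo) {
  auto F = MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
           MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, F, LLT::scalar(32), Align(4));
}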
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
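A tiny illustrative sketch (not code from this file) of the Register helpers listed above.

#include <cassert>
#include "llvm/CodeGen/Register.h"
using namespace llvm;

static void registerKinds() {
  Register VReg = Register::index2VirtReg(0); // 0-based index -> vreg number
  assert(VReg.isVirtual() && !VReg.isPhysical());
}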
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
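A hedged sketch of how the helpers above (getConstant and getSelectCC) compose inside some custom-lowering routine that already has a SelectionDAG in hand. The operand names X, A, B and the helper name are illustrative assumptions, not code from this file; real targets usually pick the compare result type via getSetCCResultType().

#include "llvm/CodeGen/SelectionDAG.h"

// Build (X != 0) ? A : B. getSelectCC folds the SETCC and SELECT into one call.
static llvm::SDValue selectIfNonZero(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                     llvm::SDValue X, llvm::SDValue A, llvm::SDValue B) {
  llvm::SDValue Zero = DAG.getConstant(0, DL, X.getValueType());
  return DAG.getSelectCC(DL, X, Zero, A, B, llvm::ISD::SETNE);
}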
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
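A minimal sketch pairing getConstant with getSplatBuildVector to materialise a uniform vector constant; the v4i32 type, the immediate, and the helper name are illustrative only.

#include "llvm/CodeGen/SelectionDAG.h"

// Build a v4i32 whose four lanes all hold the same immediate.
static llvm::SDValue buildSplat4xi32(llvm::SelectionDAG &DAG, const llvm::SDLoc &DL,
                                     uint64_t Imm) {
  llvm::SDValue Elt = DAG.getConstant(Imm, DL, llvm::MVT::i32);
  return DAG.getSplatBuildVector(llvm::MVT::v4i32, DL, Elt);
}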
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
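A small, self-contained sketch of the SmallPtrSet/SmallVector operations listed above (insert, append, resize, push_back); the element types and values are arbitrary.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include <iterator>

void smallContainersDemo() {
  llvm::SmallVector<int, 8> Vals;
  Vals.push_back(1);                                  // single element
  const int More[] = {2, 3, 4};
  Vals.append(std::begin(More), std::end(More));      // append a range
  Vals.resize(8);                                     // grow; new slots are value-initialized

  llvm::SmallPtrSet<const int *, 4> Seen;
  bool Inserted = Seen.insert(&Vals[0]).second;       // true: first time this pointer is seen
  (void)Inserted;
}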
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
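A minimal sketch of the StringSwitch pattern described above: each Case() pairs a string literal with a value and Default() supplies the fallback. The strings and return values here are made up for illustration, not taken from this file.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

static unsigned parseWaveSize(llvm::StringRef S) {
  return llvm::StringSwitch<unsigned>(S)
      .Case("wavefrontsize32", 32)
      .Case("wavefrontsize64", 64)
      .Default(0); // unknown string
}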
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
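A hedged one-liner showing the IR-to-ISD predicate mapping performed by getICmpCondCode; the chosen predicate and the wrapper name are arbitrary.

#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/Instructions.h"

// Signed less-than in IR corresponds to ISD::SETLT in the DAG.
inline llvm::ISD::CondCode signedLessThanCC() {
  return llvm::getICmpCondCode(llvm::ICmpInst::ICMP_SLT); // == llvm::ISD::SETLT
}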
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
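A small sketch combining isNullConstant and peekThroughBitcasts from the entries above to test for a zero hidden behind bitcasts; the helper name is ours, a common DAG-combine idiom rather than anything defined in this file.

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Returns true if V is the integer constant 0, possibly wrapped in bitcasts.
static bool isZeroThroughBitcasts(llvm::SDValue V) {
  return llvm::isNullConstant(llvm::peekThroughBitcasts(V));
}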
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew modulo Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
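A self-contained sketch exercising several of the MathExtras/bit helpers listed above; the inputs are arbitrary and the expected results noted in the asserts are easy to verify by hand.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void mathHelpersDemo() {
  assert(llvm::isPowerOf2_32(64));                    // 64 == 2^6
  assert(llvm::Log2_32(64) == 6);                     // floor(log2(64))
  assert(llvm::countr_zero(0x50u) == 4);              // 0b0101'0000 has 4 trailing zeros
  assert(llvm::isShiftedMask_64(0x0FF0));             // one contiguous run of ones
  assert(llvm::divideCeil(10, 4) == 3);               // ceil(10 / 4)
  assert(llvm::alignDown(37, 8) == 32);               // largest multiple of 8 <= 37
  assert(llvm::Hi_32(0x1122334455667788ULL) == 0x11223344u);
  assert(llvm::Lo_32(0x1122334455667788ULL) == 0x55667788u);
  assert(llvm::isInt<8>(-128) && !llvm::isInt<8>(128)); // signed 8-bit range check
}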
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
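A short sketch of the range-based wrappers documented above (all_of, any_of, find_if, is_contained); the data is arbitrary and the helper exists only for illustration.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

void rangeHelpersDemo() {
  llvm::SmallVector<int, 4> V = {2, 4, 6, 7};
  bool AnyOdd = llvm::any_of(V, [](int X) { return X % 2 != 0; });   // true (7 is odd)
  bool AllSmall = llvm::all_of(V, [](int X) { return X < 10; });     // true
  bool HasSix = llvm::is_contained(V, 6);                            // true
  auto It = llvm::find_if(V, [](int X) { return X > 5; });           // points at 6
  (void)AnyOdd; (void)AllSmall; (void)HasSix; (void)It;
}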
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
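A brief sketch of the alignment helpers referenced above (Align::value(), alignTo, commonAlignment); the values are chosen only to make the arithmetic easy to check.

#include "llvm/Support/Alignment.h"
#include <cassert>

void alignmentDemo() {
  llvm::Align A(16);
  assert(A.value() == 16);
  assert(llvm::alignTo(30, A) == 32);                     // round 30 up to a multiple of 16
  assert(llvm::commonAlignment(A, 8) == llvm::Align(8));  // an offset of 8 limits us to 8-byte alignment
}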
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
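A compact sketch of the EVT queries listed above; it only needs an LLVMContext, and v4f32 is an arbitrary example type rather than anything specific to this file.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void evtDemo() {
  llvm::LLVMContext Ctx;
  llvm::EVT V4F32 = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == llvm::MVT::f32);
  assert(V4F32.getSizeInBits().getFixedValue() == 128);
  llvm::EVT IntVT = V4F32.changeTypeToInteger();          // v4i32
  assert(IntVT.isInteger() && IntVT.isVector());
}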
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
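A small sketch of the KnownBits entries above: seed the Zero/One masks by hand, then query and extend them. The specific bit pattern is arbitrary.

#include "llvm/Support/KnownBits.h"
#include <cassert>

void knownBitsDemo() {
  llvm::KnownBits Known(8);                       // 8-bit value, nothing known yet
  assert(Known.isUnknown());
  Known.Zero.setHighBits(4);                      // top four bits known to be 0
  Known.One.setBit(0);                            // bit 0 known to be 1
  assert(Known.countMinLeadingZeros() == 4);
  llvm::KnownBits Wide = Known.zext(16);          // zero extension adds known-zero high bits
  assert(Wide.countMinLeadingZeros() == 12);
  llvm::KnownBits Low = Known.extractBits(4, 0);  // bits [0,4): bit 0 is still known 1
  assert(Low.One[0]);
}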
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs