1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUTargetMachine.h"
19#include "GCNSubtarget.h"
22#include "SIRegisterInfo.h"
23#include "llvm/ADT/APInt.h"
25#include "llvm/ADT/Statistic.h"
39#include "llvm/IR/IRBuilder.h"
41#include "llvm/IR/IntrinsicsAMDGPU.h"
42#include "llvm/IR/IntrinsicsR600.h"
43#include "llvm/IR/MDBuilder.h"
46#include "llvm/Support/ModRef.h"
48#include <optional>
49
50using namespace llvm;
51using namespace llvm::SDPatternMatch;
52
53#define DEBUG_TYPE "si-lower"
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58 DisableLoopAlignment("amdgpu-disable-loop-alignment",
59 cl::desc("Do not align and prefetch loops"),
60 cl::init(false));
61
63 "amdgpu-use-divergent-register-indexing", cl::Hidden,
64 cl::desc("Use indirect register addressing for divergent indexes"),
65 cl::init(false));
66
67// TODO: This option should be removed once we switch to always using PTRADD in
68// the SelectionDAG.
70 "amdgpu-use-sdag-ptradd", cl::Hidden,
71 cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the "
72 "SelectionDAG ISel"),
73 cl::init(false));
74
77 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
78}
79
82 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
83}
84
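// Return the lowest-numbered SGPR that has not yet been allocated in
// \p CCInfo; hits llvm_unreachable if every SGPR is already taken.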
85static unsigned findFirstFreeSGPR(CCState &CCInfo) {
86 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
87 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
88 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
89 return AMDGPU::SGPR0 + Reg;
90 }
91 }
92 llvm_unreachable("Cannot allocate sgpr");
93}
94
96 const GCNSubtarget &STI)
97 : AMDGPUTargetLowering(TM, STI), Subtarget(&STI) {
98 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
99 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
100
101 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
102 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
103
104 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
105
106 const SIRegisterInfo *TRI = STI.getRegisterInfo();
107 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
108
109 addRegisterClass(MVT::f64, V64RegClass);
110 addRegisterClass(MVT::v2f32, V64RegClass);
111 addRegisterClass(MVT::Untyped, V64RegClass);
112
113 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
114 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
115
116 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
117 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
118
119 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
120 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
121
122 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
123 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
124
125 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
126 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
127
128 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
129 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
130
131 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
132 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
133
134 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
135 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
136
137 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
138 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
139
140 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
141 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
142
143 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
144 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
151
152 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
153 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
154
155 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
156 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
157
158 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
159 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
160
161 if (Subtarget->has16BitInsts()) {
162 if (Subtarget->useRealTrue16Insts()) {
163 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
164 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
165 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
166 } else {
167 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
170 }
171
 172 // Unless there are also VOP3P operations, no operations are really legal.
173 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
174 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
175 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
176 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
177 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
178 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
179 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
180 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
181 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
182 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
183 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
184 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
185 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
186 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
187 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
188 }
189
190 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
191 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
192
193 computeRegisterProperties(Subtarget->getRegisterInfo());
194
195 // The boolean content concept here is too inflexible. Compares only ever
196 // really produce a 1-bit result. Any copy/extend from these will turn into a
197 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
198 // it's what most targets use.
201
202 // We need to custom lower vector stores from local memory
203 setOperationAction(ISD::LOAD,
204 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
205 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
206 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
207 MVT::i1, MVT::v32i32},
208 Custom);
209
210 setOperationAction(ISD::STORE,
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
217 if (isTypeLegal(MVT::bf16)) {
218 for (unsigned Opc :
220 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
221 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
222 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
223 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
224 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
225 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
226 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
227 ISD::SETCC}) {
 228 // FIXME: The promoted-to type shouldn't need to be explicit
229 setOperationAction(Opc, MVT::bf16, Promote);
230 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
231 }
232
234
236 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
237
238 setOperationAction(ISD::FABS, MVT::bf16, Legal);
239 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
241
242 // We only need to custom lower because we can't specify an action for bf16
243 // sources.
246 }
247
248 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
249 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
250 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
251 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
252 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
253 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
254 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
259 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
264
265 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
266 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
267 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
268 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
269 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
270 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
271 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
272
273 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
274
278 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
279
280 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
281
283 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
284
286 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
287 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
288
290 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
291 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
292 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
293 Expand);
295 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
296 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
297 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
298 Expand);
299
301 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
302 MVT::v3i16, MVT::v4i16, MVT::Other},
303 Custom);
304
305 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
306 setOperationAction(ISD::BR_CC,
307 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
308
310
312
314 Expand);
315
316#if 0
318#endif
319
320 // We only support LOAD/STORE and vector manipulation ops for vectors
321 // with > 4 elements.
322 for (MVT VT :
323 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
324 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
325 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
326 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
327 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
328 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
329 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
330 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
331 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
332 switch (Op) {
333 case ISD::LOAD:
334 case ISD::STORE:
336 case ISD::BITCAST:
337 case ISD::UNDEF:
341 case ISD::IS_FPCLASS:
342 break;
347 break;
348 default:
350 break;
351 }
352 }
353 }
354
355 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
356
357 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
358 // is expanded to avoid having two separate loops in case the index is a VGPR.
359
360 // Most operations are naturally 32-bit vector operations. We only support
361 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
362 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
364 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
365
367 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
368
370 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
371
373 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
374 }
375
376 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
378 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
379
381 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
382
384 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
385
387 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
388 }
389
390 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
392 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
393
395 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
396
398 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
399
401 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
402 }
403
404 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
406 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
407
409 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
410
412 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
413
415 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
416 }
417
418 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
420 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
421
423 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
424
426 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
427
429 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
430 }
431
433 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
434 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
435 Custom);
436
437 if (Subtarget->hasPkMovB32()) {
438 // TODO: 16-bit element vectors should be legal with even aligned elements.
439 // TODO: Can be legal with wider source types than the result with
440 // subregister extracts.
441 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
442 }
443
445 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
446 // instead lower to cndmask in SITargetLowering::LowerSELECT().
448 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
449 // alignbit.
450 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
451
452 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
453 Custom);
454
455 // Avoid stack access for these.
456 // TODO: Generalize to more vector types.
458 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
459 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
460 Custom);
461
462 // Deal with vec3 vector operations when widened to vec4.
464 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
465
466 // Deal with vec5/6/7 vector operations when widened to vec8.
468 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
469 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
470 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
471 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
472 Custom);
473
474 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
475 // and output demarshalling
476 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
477
478 // We can't return success/failure, only the old value,
479 // let LLVM add the comparison
480 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
481 Expand);
482
483 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
484
485 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
486
487 // FIXME: This should be narrowed to i32, but that only happens if i64 is
488 // illegal.
489 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
490 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
491
 492 // This is s_memtime on SI and s_memrealtime on VI.
493 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
494
495 if (Subtarget->hasSMemRealTime() ||
496 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
497 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
498 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
499
500 if (Subtarget->has16BitInsts()) {
501 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
502 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
503 } else {
504 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
505 }
506
507 if (Subtarget->hasMadMacF32Insts())
509
510 if (!Subtarget->hasBFI())
511 // fcopysign can be done in a single instruction with BFI.
512 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
513
514 if (!Subtarget->hasBCNT(32))
516
517 if (!Subtarget->hasBCNT(64))
519
520 if (Subtarget->hasFFBH())
522
523 if (Subtarget->hasFFBL())
525
526 // We only really have 32-bit BFE instructions (and 16-bit on VI).
527 //
528 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
529 // effort to match them now. We want this to be false for i64 cases when the
530 // extraction isn't restricted to the upper or lower half. Ideally we would
531 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
532 // span the midpoint are probably relatively rare, so don't worry about them
533 // for now.
534 if (Subtarget->hasBFE())
536
537 // Clamp modifier on add/sub
538 if (Subtarget->hasIntClamp())
540
541 if (Subtarget->hasAddNoCarry())
542 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
543 Legal);
544
546 {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
547 {MVT::f32, MVT::f64}, Custom);
548
549 // These are really only legal for ieee_mode functions. We should be avoiding
550 // them for functions that don't have ieee_mode enabled, so just say they are
551 // legal.
552 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
553 {MVT::f32, MVT::f64}, Legal);
554
555 if (Subtarget->haveRoundOpsF64())
556 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
557 Legal);
558 else
559 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
560 MVT::f64, Custom);
561
562 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
563 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
564 Legal);
565 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
566
567 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
569
570 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
571 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
572
573 // Custom lower these because we can't specify a rule based on an illegal
574 // source bf16.
575 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
576 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
577
578 if (Subtarget->has16BitInsts()) {
581 MVT::i16, Legal);
582
583 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
584
586 MVT::i16, Expand);
587
591 ISD::CTPOP},
592 MVT::i16, Promote);
593
594 setOperationAction(ISD::LOAD, MVT::i16, Custom);
595
596 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
597
598 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
599 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
600 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
601 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
602
606
608
609 // F16 - Constant Actions.
612
613 // F16 - Load/Store Actions.
614 setOperationAction(ISD::LOAD, MVT::f16, Promote);
615 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
616 setOperationAction(ISD::STORE, MVT::f16, Promote);
617 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
618
619 // BF16 - Load/Store Actions.
620 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
621 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
622 setOperationAction(ISD::STORE, MVT::bf16, Promote);
623 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
624
625 // F16 - VOP1 Actions.
627 ISD::FSIN, ISD::FROUND},
628 MVT::f16, Custom);
629
630 // BF16 - VOP1 Actions.
631 if (Subtarget->hasBF16TransInsts())
632 setOperationAction({ISD::FCOS, ISD::FSIN, ISD::FDIV}, MVT::bf16, Custom);
633
636
637 // F16 - VOP2 Actions.
638 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
639 Expand);
640 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
641 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
643
644 // F16 - VOP3 Actions.
646 if (STI.hasMadF16())
648
649 for (MVT VT :
650 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
651 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
652 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
653 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
654 switch (Op) {
655 case ISD::LOAD:
656 case ISD::STORE:
658 case ISD::BITCAST:
659 case ISD::UNDEF:
664 case ISD::IS_FPCLASS:
665 break;
669 break;
670 default:
672 break;
673 }
674 }
675 }
676
677 // v_perm_b32 can handle either of these.
678 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
680
681 // XXX - Do these do anything? Vector constants turn into build_vector.
682 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
683
684 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
685 Legal);
686
687 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
688 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
689 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
690 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
691
692 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
694 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
695 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
696
697 setOperationAction(ISD::AND, MVT::v2i16, Promote);
698 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
699 setOperationAction(ISD::OR, MVT::v2i16, Promote);
700 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
701 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
702 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
703
704 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
705 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
706 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
707 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
708 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
709 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
710
711 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
712 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
713 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
714 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
715 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
716 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
717
718 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
719 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
720 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
721 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
722 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
723 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
724
725 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
726 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
727 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
728 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
729
730 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
732 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
734 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
735 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
736
737 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
738 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
739 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
740 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
741 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
742 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
743
744 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
745 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
746 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
747 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
748 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
749 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
750
751 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
752 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
753 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
755 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
756 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
757
758 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
759 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
760 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
761 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
762 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
763 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
764
766 MVT::v2i32, Expand);
767 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
768
770 MVT::v4i32, Expand);
771
773 MVT::v8i32, Expand);
774
775 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
776 Subtarget->hasVOP3PInsts() ? Legal : Custom);
777
778 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
779 // This isn't really legal, but this avoids the legalizer unrolling it (and
780 // allows matching fneg (fabs x) patterns)
781 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
782
783 // Can do this in one BFI plus a constant materialize.
785 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
786 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
787 MVT::v32f16, MVT::v32bf16},
788 Custom);
789
791 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
792 MVT::f16, Custom);
793 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
794
795 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
796 ISD::FMAXIMUMNUM},
797 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
798 Custom);
799
800 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
801 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
802 Expand);
803
804 for (MVT Vec16 :
805 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
806 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
809 Vec16, Custom);
811 }
812 }
813
814 if (Subtarget->hasVOP3PInsts()) {
818 MVT::v2i16, Legal);
819
820 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
821 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
822 MVT::v2f16, Legal);
823
825 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
826
828 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
829 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
830 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
831 Custom);
832
833 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
834 // Split vector operations.
839 VT, Custom);
840
841 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
842 // Split vector operations.
844 VT, Custom);
845
847 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
848 {MVT::v2f16, MVT::v4f16}, Custom);
849
850 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
851 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
852 Custom);
853
854 if (Subtarget->hasPackedFP32Ops()) {
856 MVT::v2f32, Legal);
858 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
859 Custom);
860 }
861 }
862
863 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
864
865 if (Subtarget->has16BitInsts()) {
867 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
869 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
870 } else {
871 // Legalization hack.
872 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
873
874 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
875 }
876
878 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
879 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
880 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
881 MVT::v32f16, MVT::v32bf16},
882 Custom);
883
885
886 if (Subtarget->hasVectorMulU64())
888 else if (Subtarget->hasScalarSMulU64())
890
891 if (Subtarget->hasMad64_32())
893
894 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
895 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
896
897 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
899 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
900 } else {
901 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
902 if (Subtarget->hasMinimum3Maximum3F32())
903 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
904
905 if (Subtarget->hasMinimum3Maximum3PKF16()) {
906 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
907
908 // If only the vector form is available, we need to widen to a vector.
909 if (!Subtarget->hasMinimum3Maximum3F16())
910 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
911 }
912 }
913
914 if (Subtarget->hasVOP3PInsts()) {
915 // We want to break these into v2f16 pieces, not scalarize.
916 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
917 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
918 Custom);
919 }
920
921 if (Subtarget->hasIntMinMax64())
923 Legal);
924
926 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
927 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
928 MVT::i8},
929 Custom);
930
932 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
933 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
934 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
935 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
936 Custom);
937
939 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
940 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
941 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
942 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
943 Custom);
944
945 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
947 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
948 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
949 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
950
951 // TODO: Could move this to custom lowering, could benefit from combines on
952 // extract of relevant bits.
953 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
954
956
957 if (Subtarget->hasBF16ConversionInsts()) {
958 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
960 }
961
962 if (Subtarget->hasBF16PackedInsts()) {
964 {ISD::FADD, ISD::FMUL, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMA},
965 MVT::v2bf16, Legal);
966 }
967
968 if (Subtarget->hasBF16TransInsts()) {
969 setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal);
970 }
971
972 if (Subtarget->hasCvtPkF16F32Inst()) {
974 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
975 Custom);
976 }
977
979 ISD::PTRADD,
981 ISD::SUB,
983 ISD::MUL,
984 ISD::FADD,
985 ISD::FSUB,
986 ISD::FDIV,
987 ISD::FMUL,
988 ISD::FMINNUM,
989 ISD::FMAXNUM,
990 ISD::FMINNUM_IEEE,
991 ISD::FMAXNUM_IEEE,
992 ISD::FMINIMUM,
993 ISD::FMAXIMUM,
994 ISD::FMINIMUMNUM,
995 ISD::FMAXIMUMNUM,
996 ISD::FMA,
997 ISD::SMIN,
998 ISD::SMAX,
999 ISD::UMIN,
1000 ISD::UMAX,
1001 ISD::SETCC,
1003 ISD::SMIN,
1004 ISD::SMAX,
1005 ISD::UMIN,
1006 ISD::UMAX,
1007 ISD::AND,
1008 ISD::OR,
1009 ISD::XOR,
1010 ISD::SHL,
1011 ISD::SRL,
1012 ISD::SRA,
1013 ISD::FSHR,
1023
1024 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1026
1027 // All memory operations. Some folding on the pointer operand is done to help
1028 // matching the constant offsets in the addressing modes.
1029 setTargetDAGCombine({ISD::LOAD,
1030 ISD::STORE,
1031 ISD::ATOMIC_LOAD,
1032 ISD::ATOMIC_STORE,
1033 ISD::ATOMIC_CMP_SWAP,
1034 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_SWAP,
1036 ISD::ATOMIC_LOAD_ADD,
1037 ISD::ATOMIC_LOAD_SUB,
1038 ISD::ATOMIC_LOAD_AND,
1039 ISD::ATOMIC_LOAD_OR,
1040 ISD::ATOMIC_LOAD_XOR,
1041 ISD::ATOMIC_LOAD_NAND,
1042 ISD::ATOMIC_LOAD_MIN,
1043 ISD::ATOMIC_LOAD_MAX,
1044 ISD::ATOMIC_LOAD_UMIN,
1045 ISD::ATOMIC_LOAD_UMAX,
1046 ISD::ATOMIC_LOAD_FADD,
1047 ISD::ATOMIC_LOAD_FMIN,
1048 ISD::ATOMIC_LOAD_FMAX,
1049 ISD::ATOMIC_LOAD_UINC_WRAP,
1050 ISD::ATOMIC_LOAD_UDEC_WRAP,
1053
1054 // FIXME: In other contexts we pretend this is a per-function property.
1056
1058}
1059
1060const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1061
1063 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1064 return RCRegs;
1065}
1066
1067//===----------------------------------------------------------------------===//
1068// TargetLowering queries
1069//===----------------------------------------------------------------------===//
1070
1071// v_mad_mix* support a conversion from f16 to f32.
1072//
1073// There is only one special case, when denormals are enabled, that we don't
1074// currently handle where this would be OK to use.
1075bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1076 EVT DestVT, EVT SrcVT) const {
1077 return DestVT.getScalarType() == MVT::f32 &&
1078 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1079 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1080 SrcVT.getScalarType() == MVT::f16) ||
1081 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1082 SrcVT.getScalarType() == MVT::bf16)) &&
1083 // TODO: This probably only requires no input flushing?
1085}
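// For example, (fma (fpext f16:$a), (fpext f16:$b), f32:$c) can keep the
// extensions folded into a mixed-precision mad/fma when f32 denormals are
// flushed.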
1086
1088 LLT DestTy, LLT SrcTy) const {
1089 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1090 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 DestTy.getScalarSizeInBits() == 32 &&
1092 SrcTy.getScalarSizeInBits() == 16 &&
1093 // TODO: This probably only requires no input flushing?
1094 denormalModeIsFlushAllF32(*MI.getMF());
1095}
1096
1098 // SI has some legal vector types, but no legal vector operations. Say no
1099 // shuffles are legal in order to prefer scalarizing some vector operations.
1100 return false;
1101}
1102
1104 CallingConv::ID CC,
1105 EVT VT) const {
1107 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1108
1109 if (VT.isVector()) {
1110 EVT ScalarVT = VT.getScalarType();
1111 unsigned Size = ScalarVT.getSizeInBits();
1112 if (Size == 16) {
1113 if (Subtarget->has16BitInsts()) {
1114 if (VT.isInteger())
1115 return MVT::v2i16;
1116 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1117 }
1118 return VT.isInteger() ? MVT::i32 : MVT::f32;
1119 }
1120
1121 if (Size < 16)
1122 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1123 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1124 }
1125
1126 if (VT.getSizeInBits() > 32)
1127 return MVT::i32;
1128
1129 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1130}
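// For example, for non-kernel calling conventions a v4f16 argument is passed
// in v2f16 registers when 16-bit instructions are available, while vectors
// with 64-bit elements fall through to i32 register pieces.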
1131
1133 CallingConv::ID CC,
1134 EVT VT) const {
1136 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1137
1138 if (VT.isVector()) {
1139 unsigned NumElts = VT.getVectorNumElements();
1140 EVT ScalarVT = VT.getScalarType();
1141 unsigned Size = ScalarVT.getSizeInBits();
1142
1143 // FIXME: Should probably promote 8-bit vectors to i16.
1144 if (Size == 16 && Subtarget->has16BitInsts())
1145 return (NumElts + 1) / 2;
1146
1147 if (Size <= 32)
1148 return NumElts;
1149
1150 if (Size > 32)
1151 return NumElts * ((Size + 31) / 32);
1152 } else if (VT.getSizeInBits() > 32)
1153 return (VT.getSizeInBits() + 31) / 32;
1154
1155 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1156}
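// For example, a v3f16 argument on a subtarget with 16-bit instructions needs
// (3 + 1) / 2 == 2 registers, and a v2i64 argument needs
// 2 * ((64 + 31) / 32) == 4 registers.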
1157
1159 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1160 unsigned &NumIntermediates, MVT &RegisterVT) const {
1161 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1162 unsigned NumElts = VT.getVectorNumElements();
1163 EVT ScalarVT = VT.getScalarType();
1164 unsigned Size = ScalarVT.getSizeInBits();
1165 // FIXME: We should fix the ABI to be the same on targets without 16-bit
 1166 // support, but unless we can properly handle 3-vectors, it will still be
1167 // inconsistent.
1168 if (Size == 16 && Subtarget->has16BitInsts()) {
1169 if (ScalarVT == MVT::bf16) {
1170 RegisterVT = MVT::i32;
1171 IntermediateVT = MVT::v2bf16;
1172 } else {
1173 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1174 IntermediateVT = RegisterVT;
1175 }
1176 NumIntermediates = (NumElts + 1) / 2;
1177 return NumIntermediates;
1178 }
1179
1180 if (Size == 32) {
1181 RegisterVT = ScalarVT.getSimpleVT();
1182 IntermediateVT = RegisterVT;
1183 NumIntermediates = NumElts;
1184 return NumIntermediates;
1185 }
1186
1187 if (Size < 16 && Subtarget->has16BitInsts()) {
1188 // FIXME: Should probably form v2i16 pieces
1189 RegisterVT = MVT::i16;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1193 }
1194
1195 if (Size != 16 && Size <= 32) {
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = ScalarVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1200 }
1201
1202 if (Size > 32) {
1203 RegisterVT = MVT::i32;
1204 IntermediateVT = RegisterVT;
1205 NumIntermediates = NumElts * ((Size + 31) / 32);
1206 return NumIntermediates;
1207 }
1208 }
1209
1211 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1212}
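// For example, a v5f32 argument is broken into 5 f32 intermediates (one 32-bit
// register each), and a v3bf16 argument with 16-bit instructions available
// becomes 2 v2bf16 intermediates carried in i32 registers.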
1213
1215 const DataLayout &DL, Type *Ty,
1216 unsigned MaxNumLanes) {
1217 assert(MaxNumLanes != 0);
1218
1219 LLVMContext &Ctx = Ty->getContext();
1220 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1221 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1222 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1223 NumElts);
1224 }
1225
1226 return TLI.getValueType(DL, Ty);
1227}
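// For example, with MaxNumLanes == 2 a <4 x float> return type is treated as a
// v2f32 memory access; non-vector types are passed through unchanged.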
1228
1229// Peek through TFE struct returns to only use the data size.
1231 const DataLayout &DL, Type *Ty,
1232 unsigned MaxNumLanes) {
1233 auto *ST = dyn_cast<StructType>(Ty);
1234 if (!ST)
1235 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1236
1237 // TFE intrinsics return an aggregate type.
1238 assert(ST->getNumContainedTypes() == 2 &&
1239 ST->getContainedType(1)->isIntegerTy(32));
1240 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1241}
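// For example, a TFE load returning {<4 x i32>, i32} only counts the <4 x i32>
// data member towards the memory VT; the trailing i32 status word is ignored.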
1242
1243/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1244/// in-memory representation. This return value is a custom type because there
1245/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1246/// could cause issues during codegen, these address space 7 pointers will be
1247/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1248/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1249/// for cost modeling, to work. (This also sets us up decently for doing the
1250/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1252 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1253 return MVT::amdgpuBufferFatPointer;
1255 DL.getPointerSizeInBits(AS) == 192)
1256 return MVT::amdgpuBufferStridedPointer;
1258}
1259/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1260/// v8i32 when padding is added.
1261/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1262/// also v8i32 with padding.
1264 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1265 DL.getPointerSizeInBits(AS) == 160) ||
1267 DL.getPointerSizeInBits(AS) == 192))
1268 return MVT::v8i32;
1270}
1271
1272static unsigned getIntrMemWidth(unsigned IntrID) {
1273 switch (IntrID) {
1274 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1276 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 return 8;
1278 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1280 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1281 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1282 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 return 32;
1284 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1286 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1287 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1288 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 return 64;
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1295 return 128;
1296 default:
1297 llvm_unreachable("Unknown width");
1298 }
1299}
1300
1301static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad,
1303 Value *OrderingArg = CI.getArgOperand(IsLoad ? 1 : 2);
1304 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1305 switch (AtomicOrderingCABI(Ord)) {
1308 break;
1311 break;
1314 break;
1315 default:
1317 break;
1318 }
1319
1320 Info.flags =
1322 Info.flags |= MOCooperative;
1323
1324 MDNode *ScopeMD = cast<MDNode>(
1325 cast<MetadataAsValue>(CI.getArgOperand(IsLoad ? 2 : 3))->getMetadata());
1326 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1327 Info.ssid = CI.getContext().getOrInsertSyncScopeID(Scope);
1328}
1329
1331 const CallInst &CI,
1332 MachineFunction &MF,
1333 unsigned IntrID) const {
1334 Info.flags = MachineMemOperand::MONone;
1335 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1336 Info.flags |= MachineMemOperand::MOInvariant;
1337 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1339 Info.flags |= getTargetMMOFlags(CI);
1340
1341 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1343 AttributeSet Attr =
1345 MemoryEffects ME = Attr.getMemoryEffects();
1346 if (ME.doesNotAccessMemory())
1347 return false;
1348
1349 // TODO: Should images get their own address space?
1350 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1351
1352 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1353 if (RsrcIntr->IsImage) {
1354 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1356 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1357 Info.align.reset();
1358 }
1359
1360 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1361 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1362 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1363 // We conservatively set the memory operand of a buffer intrinsic to the
1364 // base resource pointer, so that we can access alias information about
1365 // those pointers. Cases like "this points at the same value
1366 // but with a different offset" are handled in
1367 // areMemAccessesTriviallyDisjoint.
1368 Info.ptrVal = RsrcArg;
1369 }
1370
1371 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1372 if (!IsSPrefetch) {
1373 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1374 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1375 Info.flags |= MachineMemOperand::MOVolatile;
1376 }
1377
1379 if (ME.onlyReadsMemory()) {
1380 if (RsrcIntr->IsImage) {
1381 unsigned MaxNumLanes = 4;
1382
1383 if (!BaseOpcode->Gather4) {
1384 // If this isn't a gather, we may have excess loaded elements in the
1385 // IR type. Check the dmask for the real number of elements loaded.
1386 unsigned DMask =
1387 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1388 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1389 }
1390
1391 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1392 CI.getType(), MaxNumLanes);
1393 } else {
1394 Info.memVT =
1396 std::numeric_limits<unsigned>::max());
1397 }
1398
1399 // FIXME: What does alignment mean for an image?
1400 Info.opc = ISD::INTRINSIC_W_CHAIN;
1401 Info.flags |= MachineMemOperand::MOLoad;
1402 } else if (ME.onlyWritesMemory()) {
1403 Info.opc = ISD::INTRINSIC_VOID;
1404
1405 Type *DataTy = CI.getArgOperand(0)->getType();
1406 if (RsrcIntr->IsImage) {
1407 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1408 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1409 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1410 DMaskLanes);
1411 } else
1412 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1413
1414 Info.flags |= MachineMemOperand::MOStore;
1415 } else {
1416 // Atomic, NoReturn Sampler or prefetch
1417 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1419 Info.flags |=
1421
1422 if (!IsSPrefetch)
1423 Info.flags |= MachineMemOperand::MOStore;
1424
1425 switch (IntrID) {
1426 default:
1427 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1428 // Fake memory access type for no return sampler intrinsics
1429 Info.memVT = MVT::i32;
1430 } else {
1431 // XXX - Should this be volatile without known ordering?
1432 Info.flags |= MachineMemOperand::MOVolatile;
1433 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1434 }
1435 break;
1436 case Intrinsic::amdgcn_raw_buffer_load_lds:
1437 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_buffer_load_lds:
1439 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1440 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1441 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1442 Info.ptrVal = CI.getArgOperand(1);
1443 return true;
1444 }
1445 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1446 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1448 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1449 Info.memVT =
1451 std::numeric_limits<unsigned>::max());
1452 Info.flags &= ~MachineMemOperand::MOStore;
1453 return true;
1454 }
1455 }
1456 }
1457 return true;
1458 }
1459
1460 switch (IntrID) {
1461 case Intrinsic::amdgcn_ds_ordered_add:
1462 case Intrinsic::amdgcn_ds_ordered_swap: {
1463 Info.opc = ISD::INTRINSIC_W_CHAIN;
1464 Info.memVT = MVT::getVT(CI.getType());
1465 Info.ptrVal = CI.getOperand(0);
1466 Info.align.reset();
1468
1469 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1470 if (!Vol->isZero())
1471 Info.flags |= MachineMemOperand::MOVolatile;
1472
1473 return true;
1474 }
1475 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1476 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1477 Info.opc = ISD::INTRINSIC_W_CHAIN;
1478 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1479 Info.ptrVal = nullptr;
1480 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1482 return true;
1483 }
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume: {
1486 Info.opc = ISD::INTRINSIC_W_CHAIN;
1487 Info.memVT = MVT::getVT(CI.getType());
1488 Info.ptrVal = CI.getOperand(0);
1489 Info.align.reset();
1491
1492 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1493 if (!Vol->isZero())
1494 Info.flags |= MachineMemOperand::MOVolatile;
1495
1496 return true;
1497 }
1498 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1499 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1500 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1503 Info.memVT = MVT::getVT(CI.getType());
1504 Info.ptrVal = CI.getOperand(0);
1505 Info.memVT = MVT::i64;
1506 Info.size = 8;
1507 Info.align.reset();
1509 return true;
1510 }
1511 case Intrinsic::amdgcn_global_atomic_csub: {
1512 Info.opc = ISD::INTRINSIC_W_CHAIN;
1513 Info.memVT = MVT::getVT(CI.getType());
1514 Info.ptrVal = CI.getOperand(0);
1515 Info.align.reset();
1518 return true;
1519 }
1520 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1522 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1523 Info.opc = ISD::INTRINSIC_W_CHAIN;
1524 Info.memVT =
1525 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1526 ? CI.getType()
1528 ->getElementType(0)); // XXX: what is correct VT?
1529
1530 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1531 Info.align.reset();
1532 Info.flags |=
1534 return true;
1535 }
1536 case Intrinsic::amdgcn_global_atomic_fmin_num:
1537 case Intrinsic::amdgcn_global_atomic_fmax_num:
1538 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1539 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1540 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1541 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1542 Info.opc = ISD::INTRINSIC_W_CHAIN;
1543 Info.memVT = MVT::getVT(CI.getType());
1544 Info.ptrVal = CI.getOperand(0);
1545 Info.align.reset();
1549 return true;
1550 }
1551 case Intrinsic::amdgcn_flat_load_monitor_b32:
1552 case Intrinsic::amdgcn_flat_load_monitor_b64:
1553 case Intrinsic::amdgcn_flat_load_monitor_b128:
1554 case Intrinsic::amdgcn_global_load_monitor_b32:
1555 case Intrinsic::amdgcn_global_load_monitor_b64:
1556 case Intrinsic::amdgcn_global_load_monitor_b128:
1557 case Intrinsic::amdgcn_cluster_load_b32:
1558 case Intrinsic::amdgcn_cluster_load_b64:
1559 case Intrinsic::amdgcn_cluster_load_b128:
1560 case Intrinsic::amdgcn_ds_load_tr6_b96:
1561 case Intrinsic::amdgcn_ds_load_tr4_b64:
1562 case Intrinsic::amdgcn_ds_load_tr8_b64:
1563 case Intrinsic::amdgcn_ds_load_tr16_b128:
1564 case Intrinsic::amdgcn_global_load_tr6_b96:
1565 case Intrinsic::amdgcn_global_load_tr4_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b64:
1567 case Intrinsic::amdgcn_global_load_tr_b128:
1568 case Intrinsic::amdgcn_ds_read_tr4_b64:
1569 case Intrinsic::amdgcn_ds_read_tr6_b96:
1570 case Intrinsic::amdgcn_ds_read_tr8_b64:
1571 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1572 Info.opc = ISD::INTRINSIC_W_CHAIN;
1573 Info.memVT = MVT::getVT(CI.getType());
1574 Info.ptrVal = CI.getOperand(0);
1575 Info.align.reset();
1576 Info.flags |= MachineMemOperand::MOLoad;
1577 return true;
1578 }
1579 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1581 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1582 Info.opc = ISD::INTRINSIC_W_CHAIN;
1583 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1584 Info.ptrVal = CI.getOperand(0);
1585 Info.align.reset();
1586 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/true, Info);
1587 return true;
1588 }
1589 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1591 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1592 Info.opc = ISD::INTRINSIC_VOID;
1593 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1594 Info.ptrVal = CI.getArgOperand(0);
1595 Info.align.reset();
1596 getCoopAtomicOperandsInfo(CI, /*IsLoad=*/false, Info);
1597 return true;
1598 }
1599 case Intrinsic::amdgcn_ds_gws_init:
1600 case Intrinsic::amdgcn_ds_gws_barrier:
1601 case Intrinsic::amdgcn_ds_gws_sema_v:
1602 case Intrinsic::amdgcn_ds_gws_sema_br:
1603 case Intrinsic::amdgcn_ds_gws_sema_p:
1604 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1605 Info.opc = ISD::INTRINSIC_VOID;
1606
1607 const GCNTargetMachine &TM =
1608 static_cast<const GCNTargetMachine &>(getTargetMachine());
1609
1611 Info.ptrVal = MFI->getGWSPSV(TM);
1612
1613 // This is an abstract access, but we need to specify a type and size.
1614 Info.memVT = MVT::i32;
1615 Info.size = 4;
1616 Info.align = Align(4);
1617
1618 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1619 Info.flags |= MachineMemOperand::MOLoad;
1620 else
1621 Info.flags |= MachineMemOperand::MOStore;
1622 return true;
1623 }
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1627 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1631 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1632 Info.opc = ISD::INTRINSIC_VOID;
1633 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1634 Info.ptrVal = CI.getArgOperand(1);
1636 return true;
1637 }
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1641 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1642 Info.opc = ISD::INTRINSIC_VOID;
1643 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1644 Info.ptrVal = CI.getArgOperand(0);
1646 return true;
1647 }
1648 case Intrinsic::amdgcn_load_to_lds:
1649 case Intrinsic::amdgcn_global_load_lds: {
1650 Info.opc = ISD::INTRINSIC_VOID;
1651 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1652 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1653 Info.ptrVal = CI.getArgOperand(1);
1655 return true;
1656 }
1657 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1658 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1659 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1660 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1661 Info.opc = ISD::INTRINSIC_W_CHAIN;
1662
1663 const GCNTargetMachine &TM =
1664 static_cast<const GCNTargetMachine &>(getTargetMachine());
1665
1667 Info.ptrVal = MFI->getGWSPSV(TM);
1668
1669 // This is an abstract access, but we need to specify a type and size.
1670 Info.memVT = MVT::i32;
1671 Info.size = 4;
1672 Info.align = Align(4);
1673
1675 return true;
1676 }
1677 case Intrinsic::amdgcn_s_prefetch_data:
1678 case Intrinsic::amdgcn_flat_prefetch:
1679 case Intrinsic::amdgcn_global_prefetch: {
1680 Info.opc = ISD::INTRINSIC_VOID;
1681 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1682 Info.ptrVal = CI.getArgOperand(0);
1683 Info.flags |= MachineMemOperand::MOLoad;
1684 return true;
1685 }
1686 default:
1687 return false;
1688 }
1689}
1690
1692 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1694 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1695 // The DAG's ValueType loses the addrspaces.
1696 // Add them as 2 extra Constant operands "from" and "to".
1697 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1698 unsigned DstAS = I.getType()->getPointerAddressSpace();
1699 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1700 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1701 break;
1702 }
1703 default:
1704 break;
1705 }
1706}
1707
1710 Type *&AccessTy) const {
1711 Value *Ptr = nullptr;
1712 switch (II->getIntrinsicID()) {
1713 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1714 case Intrinsic::amdgcn_cluster_load_b128:
1715 case Intrinsic::amdgcn_cluster_load_b64:
1716 case Intrinsic::amdgcn_cluster_load_b32:
1717 case Intrinsic::amdgcn_ds_append:
1718 case Intrinsic::amdgcn_ds_consume:
1719 case Intrinsic::amdgcn_ds_load_tr8_b64:
1720 case Intrinsic::amdgcn_ds_load_tr16_b128:
1721 case Intrinsic::amdgcn_ds_load_tr4_b64:
1722 case Intrinsic::amdgcn_ds_load_tr6_b96:
1723 case Intrinsic::amdgcn_ds_read_tr4_b64:
1724 case Intrinsic::amdgcn_ds_read_tr6_b96:
1725 case Intrinsic::amdgcn_ds_read_tr8_b64:
1726 case Intrinsic::amdgcn_ds_read_tr16_b64:
1727 case Intrinsic::amdgcn_ds_ordered_add:
1728 case Intrinsic::amdgcn_ds_ordered_swap:
1729 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1730 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1731 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1732 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1733 case Intrinsic::amdgcn_flat_load_monitor_b128:
1734 case Intrinsic::amdgcn_flat_load_monitor_b32:
1735 case Intrinsic::amdgcn_flat_load_monitor_b64:
1736 case Intrinsic::amdgcn_global_atomic_csub:
1737 case Intrinsic::amdgcn_global_atomic_fmax_num:
1738 case Intrinsic::amdgcn_global_atomic_fmin_num:
1739 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1740 case Intrinsic::amdgcn_global_load_monitor_b128:
1741 case Intrinsic::amdgcn_global_load_monitor_b32:
1742 case Intrinsic::amdgcn_global_load_monitor_b64:
1743 case Intrinsic::amdgcn_global_load_tr_b64:
1744 case Intrinsic::amdgcn_global_load_tr_b128:
1745 case Intrinsic::amdgcn_global_load_tr4_b64:
1746 case Intrinsic::amdgcn_global_load_tr6_b96:
1747 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1748 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1751 Ptr = II->getArgOperand(0);
1752 break;
1753 case Intrinsic::amdgcn_load_to_lds:
1754 case Intrinsic::amdgcn_global_load_lds:
1755 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1756 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1758 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1759 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1760 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1762 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1763 Ptr = II->getArgOperand(1);
1764 break;
1765 default:
1766 return false;
1767 }
1768 AccessTy = II->getType();
1769 Ops.push_back(Ptr);
1770 return true;
1771}
1772
1774 unsigned AddrSpace) const {
1775 if (!Subtarget->hasFlatInstOffsets()) {
1776 // Flat instructions do not have offsets, and only have the register
1777 // address.
1778 return AM.BaseOffs == 0 && AM.Scale == 0;
1779 }
1780
1781 decltype(SIInstrFlags::FLAT) FlatVariant =
1785
1786 return AM.Scale == 0 &&
1787 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1788 AM.BaseOffs, AddrSpace, FlatVariant));
1789}
1790
1792 if (Subtarget->hasFlatGlobalInsts())
1794
1795 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1796 // Assume that we will use FLAT for all global memory accesses
1797 // on VI.
1798 // FIXME: This assumption is currently wrong. On VI we still use
1799 // MUBUF instructions for the r + i addressing mode. As currently
1800 // implemented, the MUBUF instructions only work on buffer < 4GB.
1801 // It may be possible to support > 4GB buffers with MUBUF instructions,
1802 // by setting the stride value in the resource descriptor which would
1803 // increase the size limit to (stride * 4GB). However, this is risky,
1804 // because it has never been validated.
1806 }
1807
1808 return isLegalMUBUFAddressingMode(AM);
1809}
1810
1811bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1812 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1813 // additionally can do r + r + i with addr64. 32-bit has more addressing
1814 // mode options. Depending on the resource constant, it can also do
1815 // (i64 r0) + (i32 r1) * (i14 i).
1816 //
1817 // Private arrays end up using a scratch buffer most of the time, so also
1818 // assume those use MUBUF instructions. Scratch loads / stores are currently
1819 // implemented as mubuf instructions with the offen bit set, so they are
1820 // slightly different from the normal addr64 form.
1821 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1822 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1823 return false;
1824
1825 // FIXME: Since we can split immediate into soffset and immediate offset,
1826 // would it make sense to allow any immediate?
1827
1828 switch (AM.Scale) {
1829 case 0: // r + i or just i, depending on HasBaseReg.
1830 return true;
1831 case 1:
1832 return true; // We have r + r or r + i.
1833 case 2:
1834 if (AM.HasBaseReg) {
1835 // Reject 2 * r + r.
1836 return false;
1837 }
1838
1839 // Allow 2 * r as r + r,
1840 // or 2 * r + i as r + r + i.
1841 return true;
1842 default: // Don't allow n * r
1843 return false;
1844 }
1845}
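// [Illustrative sketch, not part of SIISelLowering.cpp] A standalone
// restatement of the Scale handling above, assuming a hypothetical 12-bit
// unsigned immediate limit; the real code delegates the offset check to
// SIInstrInfo::isLegalMUBUFImmOffset. The helper name is made up for this
// example.
static bool sketchIsLegalMUBUFMode(long long BaseOffs, int Scale,
                                   bool HasBaseReg) {
  if (BaseOffs < 0 || BaseOffs >= (1 << 12)) // 12-bit unsigned byte offset.
    return false;
  switch (Scale) {
  case 0: // r + i or just i.
  case 1: // r + r or r + i.
    return true;
  case 2: // 2 * r folds into r + r, but 2 * r + r has no encoding.
    return !HasBaseReg;
  default: // n * r is not expressible.
    return false;
  }
}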
1846
1848 const AddrMode &AM, Type *Ty,
1849 unsigned AS,
1850 Instruction *I) const {
1851 // No global is ever allowed as a base.
1852 if (AM.BaseGV)
1853 return false;
1854
1855 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1856 return isLegalGlobalAddressingMode(AM);
1857
1858 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1862 // If the offset isn't a multiple of 4, it probably isn't going to be
1863 // correctly aligned.
1864 // FIXME: Can we get the real alignment here?
1865 if (AM.BaseOffs % 4 != 0)
1866 return isLegalMUBUFAddressingMode(AM);
1867
1868 if (!Subtarget->hasScalarSubwordLoads()) {
1869 // There are no SMRD extloads, so if we have to do a small type access we
1870 // will use a MUBUF load.
1871 // FIXME?: We also need to do this if unaligned, but we don't know the
1872 // alignment here.
1873 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1874 return isLegalGlobalAddressingMode(AM);
1875 }
1876
1877 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1878 // SMRD instructions have an 8-bit, dword offset on SI.
1879 if (!isUInt<8>(AM.BaseOffs / 4))
1880 return false;
1881 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1882 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1883 // in 8-bits, it can use a smaller encoding.
1884 if (!isUInt<32>(AM.BaseOffs / 4))
1885 return false;
1886 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1887 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1888 if (!isUInt<20>(AM.BaseOffs))
1889 return false;
1890 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1891 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1892 // for S_BUFFER_* instructions).
1893 if (!isInt<21>(AM.BaseOffs))
1894 return false;
1895 } else {
1896 // On GFX12, all offsets are signed 24-bit in bytes.
1897 if (!isInt<24>(AM.BaseOffs))
1898 return false;
1899 }
1900
1901 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1903 AM.BaseOffs < 0) {
1904 // Scalar (non-buffer) loads can only use a negative offset if
1905 // soffset+offset is non-negative. Since the compiler can only prove that
1906 // in a few special cases, it is safer to claim that negative offsets are
1907 // not supported.
1908 return false;
1909 }
1910
1911 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1912 return true;
1913
1914 if (AM.Scale == 1 && AM.HasBaseReg)
1915 return true;
1916
1917 return false;
1918 }
1919
1920 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1921 return Subtarget->enableFlatScratch()
1923 : isLegalMUBUFAddressingMode(AM);
1924
1925 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1926 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1927 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1928 // field.
1929 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1930 // an 8-bit dword offset but we don't know the alignment here.
1931 if (!isUInt<16>(AM.BaseOffs))
1932 return false;
1933
1934 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1935 return true;
1936
1937 if (AM.Scale == 1 && AM.HasBaseReg)
1938 return true;
1939
1940 return false;
1941 }
1942
1944 // For an unknown address space, this usually means the value is, for some
1945 // reason, being used for pure arithmetic rather than for an addressing
1946 // computation. We don't have instructions that compute pointers with any
1947 // addressing modes, so treat them as having no offset, like flat
1948 // instructions.
1950 }
1951
1952 // Assume a user alias of global for unknown address spaces.
1953 return isLegalGlobalAddressingMode(AM);
1954}
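// [Illustrative sketch, not part of SIISelLowering.cpp] The per-generation
// scalar-load offset ranges from the constant-address branch above, collapsed
// into one helper. "Gen" and the helper name are invented for this example;
// the real code also handles the MUBUF fallbacks and rejects negative offsets
// for non-buffer scalar loads.
namespace sketch {
enum Gen { SI, CI, VI, GFX9, GFX12 };
inline bool isLegalSMRDImmOffset(Gen G, long long BaseOffs) {
  switch (G) {
  case SI:    // 8-bit, dword-scaled offset.
    return BaseOffs >= 0 && (BaseOffs / 4) < (1 << 8);
  case CI:    // 32-bit literal, dword-scaled.
    return BaseOffs >= 0 && (BaseOffs / 4) <= 0xFFFFFFFFll;
  case VI:    // 20-bit unsigned offset, in bytes.
    return BaseOffs >= 0 && BaseOffs < (1 << 20);
  case GFX9:  // Signed 21-bit offset, in bytes.
    return BaseOffs >= -(1 << 20) && BaseOffs < (1 << 20);
  case GFX12: // Signed 24-bit offset, in bytes.
    return BaseOffs >= -(1 << 23) && BaseOffs < (1 << 23);
  }
  return false;
}
} // namespace sketch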
1955
1957 const MachineFunction &MF) const {
1959 return (MemVT.getSizeInBits() <= 4 * 32);
1960 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1961 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1962 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1963 }
1965 return (MemVT.getSizeInBits() <= 2 * 32);
1966 return true;
1967}
1968
1970 unsigned Size, unsigned AddrSpace, Align Alignment,
1971 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1972 if (IsFast)
1973 *IsFast = 0;
1974
1975 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1976 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1977 // Check if alignment requirements for ds_read/write instructions are
1978 // disabled.
1979 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1980 return false;
1981
1982 Align RequiredAlignment(
1983 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1984 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1985 Alignment < RequiredAlignment)
1986 return false;
1987
1988 // Either the alignment requirements are "enabled", or there is an
1989 // unaligned-LDS-access-related hardware bug even though the alignment
1990 // requirements are "disabled". In either case, we need to check for proper
1991 // alignment requirements.
1992 //
1993 switch (Size) {
1994 case 64:
1995 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1996 // address is negative, then the instruction is incorrectly treated as
1997 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1998 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1999 // load later in the SILoadStoreOptimizer.
2000 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2001 return false;
2002
2003 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2004 // can do a 4-byte aligned, 8-byte access in a single operation using
2005 // ds_read2/write2_b32 with adjacent offsets.
2006 RequiredAlignment = Align(4);
2007
2008 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2009 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2010 // ds_write2_b32 depending on the alignment. In either case with either
2011 // alignment there is no faster way of doing this.
2012
2013 // The numbers returned here and below are not additive; they form a "speed
2014 // rank". They are only meant to be compared to decide whether a certain way
2015 // of lowering an operation is faster than another. For that purpose a
2016 // naturally aligned operation gets its bitsize to indicate that "it
2017 // operates with a speed comparable to an N-bit wide load". With full
2018 // alignment ds128 is slower than ds96, for example. If underaligned it
2019 // is comparable to the speed of a single dword access, which would then
2020 // mean 32 < 128 and it is faster to issue a wide load regardless.
2021 // 1 simply means "slow, don't do it": when comparing an aligned load to a
2022 // wider load which will no longer be aligned, the latter is slower.
2023 if (IsFast)
2024 *IsFast = (Alignment >= RequiredAlignment) ? 64
2025 : (Alignment < Align(4)) ? 32
2026 : 1;
2027 return true;
2028 }
2029
2030 break;
2031 case 96:
2032 if (!Subtarget->hasDS96AndDS128())
2033 return false;
2034
2035 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2036 // gfx8 and older.
2037
2038 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2039 // Naturally aligned access is fastest. However, also report it as Fast
2040 // if memory is aligned to less than a DWORD. A narrow load or store will
2041 // be just as slow as a single ds_read_b96/ds_write_b96, but there will
2042 // be more of them, so overall we pay less penalty by issuing a single
2043 // instruction.
2044
2045 // See comment on the values above.
2046 if (IsFast)
2047 *IsFast = (Alignment >= RequiredAlignment) ? 96
2048 : (Alignment < Align(4)) ? 32
2049 : 1;
2050 return true;
2051 }
2052
2053 break;
2054 case 128:
2055 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2056 return false;
2057
2058 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2059 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2060 // single operation using ds_read2/write2_b64.
2061 RequiredAlignment = Align(8);
2062
2063 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2064 // Naturally aligned access is fastest. However, also report it as Fast
2065 // if memory is aligned to less than a DWORD. A narrow load or store will
2066 // be just as slow as a single ds_read_b128/ds_write_b128, but there
2067 // will be more of them, so overall we pay less penalty by issuing a
2068 // single instruction.
2069
2070 // See comment on the values above.
2071 if (IsFast)
2072 *IsFast = (Alignment >= RequiredAlignment) ? 128
2073 : (Alignment < Align(4)) ? 32
2074 : 1;
2075 return true;
2076 }
2077
2078 break;
2079 default:
2080 if (Size > 32)
2081 return false;
2082
2083 break;
2084 }
2085
2086 // See comment on the values above.
2087 // Note that we have a single-dword or sub-dword access here, so if it is
2088 // underaligned it is the slowest possible access, hence the returned value is 0.
2089 if (IsFast)
2090 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2091
2092 return Alignment >= RequiredAlignment ||
2093 Subtarget->hasUnalignedDSAccessEnabled();
2094 }
2095
2096 // FIXME: We have to be conservative here and assume that flat operations
2097 // will access scratch. If we had access to the IR function, then we
2098 // could determine if any private memory was used in the function.
2099 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2100 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2101 bool AlignedBy4 = Alignment >= Align(4);
2102 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2103 if (IsFast)
2104 *IsFast = AlignedBy4 ? Size : 1;
2105 return true;
2106 }
2107
2108 if (IsFast)
2109 *IsFast = AlignedBy4;
2110
2111 return AlignedBy4;
2112 }
2113
2114 // So long as they are correct, wide global memory operations perform better
2115 // than multiple smaller memory ops -- even when misaligned
2116 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2117 if (IsFast)
2118 *IsFast = Size;
2119
2120 return Alignment >= Align(4) ||
2121 Subtarget->hasUnalignedBufferAccessEnabled();
2122 }
2123
2124 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2125 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2126 // out-of-bounds behavior, but in the edge case where an access starts
2127 // out-of-bounds and then enters in-bounds, the entire access would be treated
2128 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2129 // natural alignment of buffer accesses.
2130 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2131 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2132 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2133 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2134 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2135 return false;
2136 }
2137
2138 // Values smaller than a dword must be aligned.
2139 if (Size < 32)
2140 return false;
2141
2142 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2143 // byte-address are ignored, thus forcing Dword alignment.
2144 // This applies to private, global, and constant memory.
2145 if (IsFast)
2146 *IsFast = 1;
2147
2148 return Size >= 32 && Alignment >= Align(4);
2149}
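// [Illustrative sketch, not part of SIISelLowering.cpp] The LDS "speed rank"
// assignment above, restated for one access under the assumption that
// unaligned DS access is enabled. Sizes are in bits, alignments in bytes, and
// the helper name is invented for this example.
static unsigned sketchLDSSpeedRank(unsigned SizeInBits, unsigned AlignInBytes,
                                   unsigned RequiredAlignInBytes) {
  if (SizeInBits <= 32) // Single dword or sub-dword access.
    return AlignInBytes >= RequiredAlignInBytes ? SizeInBits : 0;
  if (AlignInBytes >= RequiredAlignInBytes)
    return SizeInBits; // e.g. ds_read_b64 at 8-byte alignment -> rank 64.
  if (AlignInBytes < 4)
    return 32;         // Underaligned wide access: dword-rate split ops win.
  return 1;            // "Slow, don't do it".
}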
2150
2152 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2153 unsigned *IsFast) const {
2155 Alignment, Flags, IsFast);
2156}
2157
2159 LLVMContext &Context, const MemOp &Op,
2160 const AttributeList &FuncAttributes) const {
2161 // FIXME: Should account for address space here.
2162
2163 // The default fallback uses the private pointer size as a guess for a type to
2164 // use. Make sure we switch these to 64-bit accesses.
2165
2166 if (Op.size() >= 16 &&
2167 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2168 return MVT::v4i32;
2169
2170 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2171 return MVT::v2i32;
2172
2173 // Use the default.
2174 return MVT::Other;
2175}
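// [Illustrative sketch, not part of SIISelLowering.cpp] The chunk choice above
// in plain terms: dword-aligned memcpy/memset expansions of 16+ bytes are
// carved into v4i32 pieces, 8..15 bytes into v2i32, and everything else uses
// the generic default. The helper name and byte-width encoding are invented
// for this example.
static unsigned sketchMemOpChunkBytes(unsigned long long Size,
                                      bool DstAlignedBy4) {
  if (Size >= 16 && DstAlignedBy4)
    return 16; // MVT::v4i32
  if (Size >= 8 && DstAlignedBy4)
    return 8;  // MVT::v2i32
  return 0;    // MVT::Other: let target-independent code decide.
}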
2176
2178 const MemSDNode *MemNode = cast<MemSDNode>(N);
2179 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2180}
2181
2186
2188 unsigned DestAS) const {
2189 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2190 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2191 Subtarget->hasGloballyAddressableScratch()) {
2192 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2193 return false;
2194 }
2195
2196 // Flat -> private/local is a simple truncate.
2197 // Flat -> global is no-op
2198 return true;
2199 }
2200
2201 const GCNTargetMachine &TM =
2202 static_cast<const GCNTargetMachine &>(getTargetMachine());
2203 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2204}
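// [Illustrative sketch, not part of SIISelLowering.cpp] What "simple truncate"
// means above: a 64-bit flat pointer cast to the 32-bit local or private
// address spaces keeps only the low 32 bits, while flat -> global reuses the
// same 64-bit value. The helper name is invented; the globally addressable
// scratch case handled above additionally needs a base subtraction and is not
// free.
static unsigned sketchFlatToWord32AddrSpace(unsigned long long FlatAddr) {
  return static_cast<unsigned>(FlatAddr); // Low 32 bits only.
}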
2205
2213
2215 Type *Ty) const {
2216 // FIXME: Could be smarter if called for vector constants.
2217 return true;
2218}
2219
2221 unsigned Index) const {
2223 return false;
2224
2225 // TODO: Add more cases that are cheap.
2226 return Index == 0;
2227}
2228
2229bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2230 // TODO: This should be more aggressive, particular for 16-bit element
2231 // vectors. However there are some mixed improvements and regressions.
2232 EVT EltTy = VT.getVectorElementType();
2233 return EltTy.getSizeInBits() % 32 == 0;
2234}
2235
2237 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2238 switch (Op) {
2239 case ISD::LOAD:
2240 case ISD::STORE:
2241 return true;
2242 default:
2243 return false;
2244 }
2245 }
2246
2247 // SimplifySetCC uses this function to determine whether or not it should
2248 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2249 if (VT == MVT::i1 && Op == ISD::SETCC)
2250 return false;
2251
2253}
2254
2255SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2256 const SDLoc &SL,
2257 SDValue Chain,
2258 uint64_t Offset) const {
2259 const DataLayout &DL = DAG.getDataLayout();
2263
2264 auto [InputPtrReg, RC, ArgTy] =
2265 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2266
2267 // We may not have the kernarg segment argument if we have no kernel
2268 // arguments.
2269 if (!InputPtrReg)
2270 return DAG.getConstant(Offset, SL, PtrVT);
2271
2273 SDValue BasePtr = DAG.getCopyFromReg(
2274 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2275
2276 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2277}
2278
2279SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2280 const SDLoc &SL) const {
2283 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2284}
2285
2286SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2287 const SDLoc &SL) const {
2288
2290 std::optional<uint32_t> KnownSize =
2292 if (KnownSize.has_value())
2293 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2294 return SDValue();
2295}
2296
2297SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2298 const SDLoc &SL, SDValue Val,
2299 bool Signed,
2300 const ISD::InputArg *Arg) const {
2301 // First, if it is a widened vector, narrow it.
2302 if (VT.isVector() &&
2304 EVT NarrowedVT =
2307 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2308 DAG.getConstant(0, SL, MVT::i32));
2309 }
2310
2311 // Then convert the vector elements or scalar value.
2312 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2313 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2314 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2315 }
2316
2317 if (MemVT.isFloatingPoint())
2318 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2319 else if (Signed)
2320 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2321 else
2322 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2323
2324 return Val;
2325}
2326
2327SDValue SITargetLowering::lowerKernargMemParameter(
2328 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2329 uint64_t Offset, Align Alignment, bool Signed,
2330 const ISD::InputArg *Arg) const {
2331 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2332
2333 // Try to avoid using an extload by loading earlier than the argument address,
2334 // and extracting the relevant bits. The load should hopefully be merged with
2335 // the previous argument.
2336 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2337 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2338 int64_t AlignDownOffset = alignDown(Offset, 4);
2339 int64_t OffsetDiff = Offset - AlignDownOffset;
2340
2341 EVT IntVT = MemVT.changeTypeToInteger();
2342
2343 // TODO: If we passed in the base kernel offset we could have a better
2344 // alignment than 4, but we don't really need it.
2345 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2346 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2349
2350 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2351 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2352
2353 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2354 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2355 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2356
2357 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2358 }
2359
2360 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2361 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2364
2365 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2366 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2367}
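// [Illustrative sketch, not part of SIISelLowering.cpp] The align-down trick
// above with concrete numbers: for an i16 kernel argument at byte offset 6,
// AlignDownOffset = 4 and OffsetDiff = 2, so a full dword is loaded from
// offset 4 and the argument is recovered as (Load >> 16) truncated to 16 bits.
// The helper below mirrors that arithmetic on a host value (little-endian
// layout assumed, matching the shift in the code above).
static unsigned short sketchExtractSubDwordArg(unsigned LoadedDword,
                                               unsigned long long Offset) {
  unsigned long long AlignDownOffset = Offset & ~3ull; // alignDown(Offset, 4)
  unsigned OffsetDiff = static_cast<unsigned>(Offset - AlignDownOffset);
  return static_cast<unsigned short>(LoadedDword >> (OffsetDiff * 8));
}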
2368
2369/// Coerce an argument which was passed in a different ABI type to the original
2370/// expected value type.
2371SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2372 SDValue Val,
2373 CCValAssign &VA,
2374 const SDLoc &SL) const {
2375 EVT ValVT = VA.getValVT();
2376
2377 // If this is an 8 or 16-bit value, it is really passed promoted
2378 // to 32 bits. Insert an assert[sz]ext to capture this, then
2379 // truncate to the right size.
2380 switch (VA.getLocInfo()) {
2381 case CCValAssign::Full:
2382 return Val;
2383 case CCValAssign::BCvt:
2384 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2385 case CCValAssign::SExt:
2386 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2387 DAG.getValueType(ValVT));
2388 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2389 case CCValAssign::ZExt:
2390 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2391 DAG.getValueType(ValVT));
2392 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2393 case CCValAssign::AExt:
2394 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2395 default:
2396 llvm_unreachable("Unknown loc info!");
2397 }
2398}
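// [Illustrative sketch, not part of SIISelLowering.cpp] What the ZExt case
// above amounts to at the value level: an i8/i16 argument arrives widened to
// its 32-bit location type, and the callee re-narrows it. The host-side
// analogue is a masking truncate; the helper name is invented for this
// example.
static unsigned char sketchTruncZExtPromotedArg(unsigned PromotedVal) {
  // AssertZext guarantees the high bits are already zero, so the truncate
  // keeps only the original 8-bit payload.
  return static_cast<unsigned char>(PromotedVal & 0xFF);
}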
2399
2400SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2401 CCValAssign &VA, const SDLoc &SL,
2402 SDValue Chain,
2403 const ISD::InputArg &Arg) const {
2404 MachineFunction &MF = DAG.getMachineFunction();
2405 MachineFrameInfo &MFI = MF.getFrameInfo();
2406
2407 if (Arg.Flags.isByVal()) {
2408 unsigned Size = Arg.Flags.getByValSize();
2409 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2410 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2411 }
2412
2413 unsigned ArgOffset = VA.getLocMemOffset();
2414 unsigned ArgSize = VA.getValVT().getStoreSize();
2415
2416 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2417
2418 // Create load nodes to retrieve arguments from the stack.
2419 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2420
2421 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2423 MVT MemVT = VA.getValVT();
2424
2425 switch (VA.getLocInfo()) {
2426 default:
2427 break;
2428 case CCValAssign::BCvt:
2429 MemVT = VA.getLocVT();
2430 break;
2431 case CCValAssign::SExt:
2432 ExtType = ISD::SEXTLOAD;
2433 break;
2434 case CCValAssign::ZExt:
2435 ExtType = ISD::ZEXTLOAD;
2436 break;
2437 case CCValAssign::AExt:
2438 ExtType = ISD::EXTLOAD;
2439 break;
2440 }
2441
2442 SDValue ArgValue = DAG.getExtLoad(
2443 ExtType, SL, VA.getLocVT(), Chain, FIN,
2445
2446 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2447 if (ConvertedVal == ArgValue)
2448 return ConvertedVal;
2449
2450 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2451}
2452
2453SDValue SITargetLowering::lowerWorkGroupId(
2454 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2457 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2458 if (!Subtarget->hasClusters())
2459 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2460
2461 // Clusters are supported. Return the global position in the grid. If clusters
2462 // are enabled, WorkGroupIdPV returns the cluster ID, not the workgroup ID.
2463
2464 // WorkGroupIdXYZ = ClusterId == 0 ?
2465 // ClusterIdXYZ :
2466 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2467 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2468 SDLoc SL(ClusterIdXYZ);
2469 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2470 SDValue One = DAG.getConstant(1, SL, VT);
2471 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2472 SDValue ClusterWorkGroupIdXYZ =
2473 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2474 SDValue GlobalIdXYZ =
2475 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2476 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2477
2478 switch (MFI.getClusterDims().getKind()) {
2481 return GlobalIdXYZ;
2483 return ClusterIdXYZ;
2485 using namespace AMDGPU::Hwreg;
2486 SDValue ClusterIdField =
2487 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2488 SDNode *GetReg =
2489 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2490 SDValue ClusterId(GetReg, 0);
2491 SDValue Zero = DAG.getConstant(0, SL, VT);
2492 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2493 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2494 }
2495 }
2496
2497 llvm_unreachable("nothing should reach here");
2498}
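// [Illustrative sketch, not part of SIISelLowering.cpp] The cluster arithmetic
// above with concrete numbers for one dimension: if each cluster spans 4
// workgroups in X (ClusterMaxIdX = 3), the cluster id in X is 5 and the
// position inside the cluster is 2, the global workgroup id is
// 5 * (3 + 1) + 2 = 22. When the ClusterId hardware field reads as 0, the
// preloaded ClusterIdXYZ value is already the workgroup id and is used as-is.
// The helper name is invented for this example.
static unsigned sketchGlobalWorkGroupId(unsigned ClusterId,
                                        unsigned ClusterMaxId,
                                        unsigned ClusterWorkGroupId) {
  return ClusterId * (ClusterMaxId + 1) + ClusterWorkGroupId;
}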
2499
2500SDValue SITargetLowering::getPreloadedValue(
2501 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2503 const ArgDescriptor *Reg = nullptr;
2504 const TargetRegisterClass *RC;
2505 LLT Ty;
2506
2508 const ArgDescriptor WorkGroupIDX =
2509 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2510 // If GridZ is not programmed in an entry function then the hardware will set
2511 // it to all zeros, so there is no need to mask the GridY value in the low
2512 // order bits.
2513 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2514 AMDGPU::TTMP7,
2515 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2516 const ArgDescriptor WorkGroupIDZ =
2517 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2518 const ArgDescriptor ClusterWorkGroupIDX =
2519 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2520 const ArgDescriptor ClusterWorkGroupIDY =
2521 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2522 const ArgDescriptor ClusterWorkGroupIDZ =
2523 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2524 const ArgDescriptor ClusterWorkGroupMaxIDX =
2525 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2526 const ArgDescriptor ClusterWorkGroupMaxIDY =
2527 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2528 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2529 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2530 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2531 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2532
2533 auto LoadConstant = [&](unsigned N) {
2534 return DAG.getConstant(N, SDLoc(), VT);
2535 };
2536
2537 if (Subtarget->hasArchitectedSGPRs() &&
2539 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2540 bool HasFixedDims = ClusterDims.isFixedDims();
2541
2542 switch (PVID) {
2544 Reg = &WorkGroupIDX;
2545 RC = &AMDGPU::SReg_32RegClass;
2546 Ty = LLT::scalar(32);
2547 break;
2549 Reg = &WorkGroupIDY;
2550 RC = &AMDGPU::SReg_32RegClass;
2551 Ty = LLT::scalar(32);
2552 break;
2554 Reg = &WorkGroupIDZ;
2555 RC = &AMDGPU::SReg_32RegClass;
2556 Ty = LLT::scalar(32);
2557 break;
2559 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2560 return LoadConstant(0);
2561 Reg = &ClusterWorkGroupIDX;
2562 RC = &AMDGPU::SReg_32RegClass;
2563 Ty = LLT::scalar(32);
2564 break;
2566 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2567 return LoadConstant(0);
2568 Reg = &ClusterWorkGroupIDY;
2569 RC = &AMDGPU::SReg_32RegClass;
2570 Ty = LLT::scalar(32);
2571 break;
2573 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2574 return LoadConstant(0);
2575 Reg = &ClusterWorkGroupIDZ;
2576 RC = &AMDGPU::SReg_32RegClass;
2577 Ty = LLT::scalar(32);
2578 break;
2580 if (HasFixedDims)
2581 return LoadConstant(ClusterDims.getDims()[0] - 1);
2582 Reg = &ClusterWorkGroupMaxIDX;
2583 RC = &AMDGPU::SReg_32RegClass;
2584 Ty = LLT::scalar(32);
2585 break;
2587 if (HasFixedDims)
2588 return LoadConstant(ClusterDims.getDims()[1] - 1);
2589 Reg = &ClusterWorkGroupMaxIDY;
2590 RC = &AMDGPU::SReg_32RegClass;
2591 Ty = LLT::scalar(32);
2592 break;
2594 if (HasFixedDims)
2595 return LoadConstant(ClusterDims.getDims()[2] - 1);
2596 Reg = &ClusterWorkGroupMaxIDZ;
2597 RC = &AMDGPU::SReg_32RegClass;
2598 Ty = LLT::scalar(32);
2599 break;
2601 Reg = &ClusterWorkGroupMaxFlatID;
2602 RC = &AMDGPU::SReg_32RegClass;
2603 Ty = LLT::scalar(32);
2604 break;
2605 default:
2606 break;
2607 }
2608 }
2609
2610 if (!Reg)
2611 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2612 if (!Reg) {
2614 // It's possible for a kernarg intrinsic call to appear in a kernel with
2615 // no allocated segment, in which case we do not add the user sgpr
2616 // argument, so just return null.
2617 return DAG.getConstant(0, SDLoc(), VT);
2618 }
2619
2620 // It's undefined behavior if a function marked with the amdgpu-no-*
2621 // attributes uses the corresponding intrinsic.
2622 return DAG.getPOISON(VT);
2623 }
2624
2625 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2626}
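// [Illustrative sketch, not part of SIISelLowering.cpp] The TTMP6 nibble
// layout implied by the masks above: cluster workgroup id X/Y/Z in bits
// [3:0]/[7:4]/[11:8], the per-dimension max ids in [15:12]/[19:16]/[23:20],
// and the max flat id in [27:24]. Decoding one field is a shift and mask; the
// helper name and field indexing are invented for this example.
static unsigned sketchTTMP6Field(unsigned TTMP6, unsigned FieldIndex) {
  // FieldIndex 0..6 selects the nibbles in the order listed above.
  return (TTMP6 >> (FieldIndex * 4)) & 0xF;
}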
2627
2629 CallingConv::ID CallConv,
2630 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2631 FunctionType *FType,
2633 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2634 const ISD::InputArg *Arg = &Ins[I];
2635
2636 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2637 "vector type argument should have been split");
2638
2639 // First check if it's a PS input addr.
2640 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2641 PSInputNum <= 15) {
2642 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2643
2644 // Inconveniently only the first part of the split is marked as isSplit,
2645 // so skip to the end. We only want to increment PSInputNum once for the
2646 // entire split argument.
2647 if (Arg->Flags.isSplit()) {
2648 while (!Arg->Flags.isSplitEnd()) {
2649 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2650 "unexpected vector split in ps argument type");
2651 if (!SkipArg)
2652 Splits.push_back(*Arg);
2653 Arg = &Ins[++I];
2654 }
2655 }
2656
2657 if (SkipArg) {
2658 // We can safely skip PS inputs.
2659 Skipped.set(Arg->getOrigArgIndex());
2660 ++PSInputNum;
2661 continue;
2662 }
2663
2664 Info->markPSInputAllocated(PSInputNum);
2665 if (Arg->Used)
2666 Info->markPSInputEnabled(PSInputNum);
2667
2668 ++PSInputNum;
2669 }
2670
2671 Splits.push_back(*Arg);
2672 }
2673}
2674
2675// Allocate special inputs passed in VGPRs.
2677 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2678 SIMachineFunctionInfo &Info) const {
2679 const LLT S32 = LLT::scalar(32);
2681
2682 if (Info.hasWorkItemIDX()) {
2683 Register Reg = AMDGPU::VGPR0;
2684 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2685
2686 CCInfo.AllocateReg(Reg);
2687 unsigned Mask =
2688 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2689 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2690 }
2691
2692 if (Info.hasWorkItemIDY()) {
2693 assert(Info.hasWorkItemIDX());
2694 if (Subtarget->hasPackedTID()) {
2695 Info.setWorkItemIDY(
2696 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2697 } else {
2698 unsigned Reg = AMDGPU::VGPR1;
2699 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2700
2701 CCInfo.AllocateReg(Reg);
2702 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2703 }
2704 }
2705
2706 if (Info.hasWorkItemIDZ()) {
2707 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2708 if (Subtarget->hasPackedTID()) {
2709 Info.setWorkItemIDZ(
2710 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2711 } else {
2712 unsigned Reg = AMDGPU::VGPR2;
2713 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2714
2715 CCInfo.AllocateReg(Reg);
2716 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2717 }
2718 }
2719}
2720
2721 // Try to allocate a VGPR at the end of the argument list, or, if no argument
2722 // VGPRs are left, allocate a stack slot instead.
2723 // If \p Mask is given, it indicates the bitfield position in the register.
2724 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2725static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2726 ArgDescriptor Arg = ArgDescriptor()) {
2727 if (Arg.isSet())
2728 return ArgDescriptor::createArg(Arg, Mask);
2729
2730 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2731 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2732 if (RegIdx == ArgVGPRs.size()) {
2733 // Spill to stack required.
2734 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2735
2736 return ArgDescriptor::createStack(Offset, Mask);
2737 }
2738
2739 unsigned Reg = ArgVGPRs[RegIdx];
2740 Reg = CCInfo.AllocateReg(Reg);
2741 assert(Reg != AMDGPU::NoRegister);
2742
2743 MachineFunction &MF = CCInfo.getMachineFunction();
2744 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2745 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2746 return ArgDescriptor::createRegister(Reg, Mask);
2747}
2748
2750 const TargetRegisterClass *RC,
2751 unsigned NumArgRegs) {
2752 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2753 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2754 if (RegIdx == ArgSGPRs.size())
2755 report_fatal_error("ran out of SGPRs for arguments");
2756
2757 unsigned Reg = ArgSGPRs[RegIdx];
2758 Reg = CCInfo.AllocateReg(Reg);
2759 assert(Reg != AMDGPU::NoRegister);
2760
2761 MachineFunction &MF = CCInfo.getMachineFunction();
2762 MF.addLiveIn(Reg, RC);
2764}
2765
2766// If this has a fixed position, we still should allocate the register in the
2767// CCInfo state. Technically we could get away with this for values passed
2768// outside of the normal argument range.
2770 const TargetRegisterClass *RC,
2771 MCRegister Reg) {
2772 Reg = CCInfo.AllocateReg(Reg);
2773 assert(Reg != AMDGPU::NoRegister);
2774 MachineFunction &MF = CCInfo.getMachineFunction();
2775 MF.addLiveIn(Reg, RC);
2776}
2777
2778static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2779 if (Arg) {
2780 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2781 Arg.getRegister());
2782 } else
2783 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2784}
2785
2786static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2787 if (Arg) {
2788 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2789 Arg.getRegister());
2790 } else
2791 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2792}
2793
2794/// Allocate implicit function VGPR arguments at the end of allocated user
2795/// arguments.
2797 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2798 SIMachineFunctionInfo &Info) const {
2799 const unsigned Mask = 0x3ff;
2800 ArgDescriptor Arg;
2801
2802 if (Info.hasWorkItemIDX()) {
2803 Arg = allocateVGPR32Input(CCInfo, Mask);
2804 Info.setWorkItemIDX(Arg);
2805 }
2806
2807 if (Info.hasWorkItemIDY()) {
2808 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2809 Info.setWorkItemIDY(Arg);
2810 }
2811
2812 if (Info.hasWorkItemIDZ())
2813 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2814}
2815
2816/// Allocate implicit function VGPR arguments in fixed registers.
2818 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2819 SIMachineFunctionInfo &Info) const {
2820 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2821 if (!Reg)
2822 report_fatal_error("failed to allocate VGPR for implicit arguments");
2823
2824 const unsigned Mask = 0x3ff;
2825 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2826 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2827 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2828}
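// [Illustrative sketch, not part of SIISelLowering.cpp] The packed layout set
// up above: all three workitem ids share one VGPR, with X in bits [9:0], Y in
// [19:10] and Z in [29:20] (mask 0x3ff shifted by 0/10/20). A host-side decode
// of such a packed value looks like this; the helper name is invented for this
// example.
static void sketchUnpackWorkItemIds(unsigned Packed, unsigned &X, unsigned &Y,
                                    unsigned &Z) {
  X = Packed & 0x3ff;
  Y = (Packed >> 10) & 0x3ff;
  Z = (Packed >> 20) & 0x3ff;
}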
2829
2831 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2832 SIMachineFunctionInfo &Info) const {
2833 auto &ArgInfo = Info.getArgInfo();
2834 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2835
2836 // TODO: Unify handling with private memory pointers.
2837 if (UserSGPRInfo.hasDispatchPtr())
2838 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2839
2840 if (UserSGPRInfo.hasQueuePtr())
2841 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2842
2843 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2844 // constant offset from the kernarg segment.
2845 if (Info.hasImplicitArgPtr())
2846 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2847
2848 if (UserSGPRInfo.hasDispatchID())
2849 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2850
2851 // flat_scratch_init is not applicable for non-kernel functions.
2852
2853 if (Info.hasWorkGroupIDX())
2854 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2855
2856 if (Info.hasWorkGroupIDY())
2857 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2858
2859 if (Info.hasWorkGroupIDZ())
2860 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2861
2862 if (Info.hasLDSKernelId())
2863 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2864}
2865
2866// Allocate special inputs passed in user SGPRs.
2868 MachineFunction &MF,
2869 const SIRegisterInfo &TRI,
2870 SIMachineFunctionInfo &Info) const {
2871 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2872 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2873 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2874 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2875 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2876 }
2877
2878 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2879 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2880 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2881 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2882 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2883 }
2884
2885 if (UserSGPRInfo.hasDispatchPtr()) {
2886 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2887 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2888 CCInfo.AllocateReg(DispatchPtrReg);
2889 }
2890
2891 if (UserSGPRInfo.hasQueuePtr()) {
2892 Register QueuePtrReg = Info.addQueuePtr(TRI);
2893 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2894 CCInfo.AllocateReg(QueuePtrReg);
2895 }
2896
2897 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2899 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2900 CCInfo.AllocateReg(InputPtrReg);
2901
2902 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2903 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2904 }
2905
2906 if (UserSGPRInfo.hasDispatchID()) {
2907 Register DispatchIDReg = Info.addDispatchID(TRI);
2908 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2909 CCInfo.AllocateReg(DispatchIDReg);
2910 }
2911
2912 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2913 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2914 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2915 CCInfo.AllocateReg(FlatScratchInitReg);
2916 }
2917
2918 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2919 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2920 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2921 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2922 }
2923
2924 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2925 // these from the dispatch pointer.
2926}
2927
2928 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2929 // sequential, starting from the first argument.
2931 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2933 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2934 Function &F = MF.getFunction();
2935 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
2936 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2937 bool InPreloadSequence = true;
2938 unsigned InIdx = 0;
2939 bool AlignedForImplictArgs = false;
2940 unsigned ImplicitArgOffset = 0;
2941 for (auto &Arg : F.args()) {
2942 if (!InPreloadSequence || !Arg.hasInRegAttr())
2943 break;
2944
2945 unsigned ArgIdx = Arg.getArgNo();
2946 // Don't preload non-original args or parts not in the current preload
2947 // sequence.
2948 if (InIdx < Ins.size() &&
2949 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
2950 break;
2951
2952 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2953 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2954 InIdx++) {
2955 assert(ArgLocs[ArgIdx].isMemLoc());
2956 auto &ArgLoc = ArgLocs[InIdx];
2957 const Align KernelArgBaseAlign = Align(16);
2958 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2959 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2960 unsigned NumAllocSGPRs =
2961 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2962
2963 // Fix alignment for hidden arguments.
2964 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2965 if (!AlignedForImplictArgs) {
2966 ImplicitArgOffset =
2967 alignTo(LastExplicitArgOffset,
2968 Subtarget->getAlignmentForImplicitArgPtr()) -
2969 LastExplicitArgOffset;
2970 AlignedForImplictArgs = true;
2971 }
2972 ArgOffset += ImplicitArgOffset;
2973 }
2974
2975 // Arg is preloaded into the previous SGPR.
2976 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2977 assert(InIdx >= 1 && "No previous SGPR");
2978 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2979 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2980 continue;
2981 }
2982
2983 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2984 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2985 // Check for free user SGPRs for preloading.
2986 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
2987 InPreloadSequence = false;
2988 break;
2989 }
2990
2991 // Preload this argument.
2992 const TargetRegisterClass *RC =
2993 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2994 SmallVectorImpl<MCRegister> *PreloadRegs =
2995 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2996
2997 if (PreloadRegs->size() > 1)
2998 RC = &AMDGPU::SGPR_32RegClass;
2999 for (auto &Reg : *PreloadRegs) {
3000 assert(Reg);
3001 MF.addLiveIn(Reg, RC);
3002 CCInfo.AllocateReg(Reg);
3003 }
3004
3005 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3006 }
3007 }
3008}
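// [Illustrative sketch, not part of SIISelLowering.cpp] The SGPR accounting
// above with concrete numbers: if the previous explicit argument ended at byte
// offset 8 and the next in-sequence argument is an i64 at offset 16, then
// Padding = 8 bytes -> PaddingSGPRs = 2, the argument itself needs
// alignTo(64, 32) / 32 = 2 SGPRs, and preloading it consumes 4 free user
// SGPRs. The helper name is invented for this example.
static unsigned sketchPreloadSGPRCost(unsigned ArgOffset,
                                      unsigned LastExplicitArgOffset,
                                      unsigned ArgSizeInBits) {
  unsigned Padding = ArgOffset - LastExplicitArgOffset;
  unsigned PaddingSGPRs = (Padding + 3) / 4;          // alignTo(Padding, 4) / 4
  unsigned NumAllocSGPRs = (ArgSizeInBits + 31) / 32; // alignTo(bits, 32) / 32
  return PaddingSGPRs + NumAllocSGPRs;
}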
3009
3011 const SIRegisterInfo &TRI,
3012 SIMachineFunctionInfo &Info) const {
3013 // Always allocate this last since it is a synthetic preload.
3014 if (Info.hasLDSKernelId()) {
3015 Register Reg = Info.addLDSKernelId();
3016 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3017 CCInfo.AllocateReg(Reg);
3018 }
3019}
3020
3021// Allocate special input registers that are initialized per-wave.
3024 CallingConv::ID CallConv,
3025 bool IsShader) const {
3026 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3027 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3028 // Note: user SGPRs are handled by the front-end for graphics shaders
3029 // Pad up the used user SGPRs with dead inputs.
3030
3031 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3032 // before enabling architected SGPRs for workgroup IDs.
3033 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3034
3035 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3036 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3037 // rely on it to reach 16 since if we end up having no stack usage, it will
3038 // not really be added.
3039 unsigned NumRequiredSystemSGPRs =
3040 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3041 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3042 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3043 Register Reg = Info.addReservedUserSGPR();
3044 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3045 CCInfo.AllocateReg(Reg);
3046 }
3047 }
3048
3049 if (!HasArchitectedSGPRs) {
3050 if (Info.hasWorkGroupIDX()) {
3051 Register Reg = Info.addWorkGroupIDX();
3052 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3053 CCInfo.AllocateReg(Reg);
3054 }
3055
3056 if (Info.hasWorkGroupIDY()) {
3057 Register Reg = Info.addWorkGroupIDY();
3058 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3059 CCInfo.AllocateReg(Reg);
3060 }
3061
3062 if (Info.hasWorkGroupIDZ()) {
3063 Register Reg = Info.addWorkGroupIDZ();
3064 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3065 CCInfo.AllocateReg(Reg);
3066 }
3067 }
3068
3069 if (Info.hasWorkGroupInfo()) {
3070 Register Reg = Info.addWorkGroupInfo();
3071 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3072 CCInfo.AllocateReg(Reg);
3073 }
3074
3075 if (Info.hasPrivateSegmentWaveByteOffset()) {
3076 // Scratch wave offset passed in system SGPR.
3077 unsigned PrivateSegmentWaveByteOffsetReg;
3078
3079 if (IsShader) {
3080 PrivateSegmentWaveByteOffsetReg =
3081 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3082
3083 // This is true if the scratch wave byte offset doesn't have a fixed
3084 // location.
3085 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3086 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3087 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3088 }
3089 } else
3090 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3091
3092 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3093 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3094 }
3095
3096 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3097 Info.getNumPreloadedSGPRs() >= 16);
3098}
3099
3101 MachineFunction &MF,
3102 const SIRegisterInfo &TRI,
3104 // Now that we've figured out where the scratch register inputs are, see if
3105 // we should reserve the arguments and use them directly.
3106 MachineFrameInfo &MFI = MF.getFrameInfo();
3107 bool HasStackObjects = MFI.hasStackObjects();
3108 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3109
3110 // Record that we know we have non-spill stack objects so we don't need to
3111 // check all stack objects later.
3112 if (HasStackObjects)
3113 Info.setHasNonSpillStackObjects(true);
3114
3115 // Everything live out of a block is spilled with fast regalloc, so it's
3116 // almost certain that spilling will be required.
3117 if (TM.getOptLevel() == CodeGenOptLevel::None)
3118 HasStackObjects = true;
3119
3120 // For now assume stack access is needed in any callee functions, so we need
3121 // the scratch registers to pass in.
3122 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3123
3124 if (!ST.enableFlatScratch()) {
3125 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3126 // If we have stack objects, we unquestionably need the private buffer
3127 // resource. For the Code Object V2 ABI, this will be the first 4 user
3128 // SGPR inputs. We can reserve those and use them directly.
3129
3130 Register PrivateSegmentBufferReg =
3132 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3133 } else {
3134 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3135 // We tentatively reserve the last available registers (skipping those
3136 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3137 // we'll replace these with the ones immediately after those which were
3138 // really allocated. In the prologue copies will be inserted from the
3139 // argument to these reserved registers.
3140
3141 // Without HSA, relocations are used for the scratch pointer and the
3142 // buffer resource setup is always inserted in the prologue. Scratch wave
3143 // offset is still in an input SGPR.
3144 Info.setScratchRSrcReg(ReservedBufferReg);
3145 }
3146 }
3147
3149
3150 // For entry functions we have to set up the stack pointer if we use it,
3151 // whereas non-entry functions get this "for free". This means there is no
3152 // intrinsic advantage to using S32 over S34 in cases where we do not have
3153 // calls but do need a frame pointer (i.e. if we are requested to have one
3154 // because frame pointer elimination is disabled). To keep things simple we
3155 // only ever use S32 as the call ABI stack pointer, and so using it does not
3156 // imply we need a separate frame pointer.
3157 //
3158 // Try to use s32 as the SP, but move it if it would interfere with input
3159 // arguments. This won't work with calls though.
3160 //
3161 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3162 // registers.
3163 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3164 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3165 } else {
3167
3168 if (MFI.hasCalls())
3169 report_fatal_error("call in graphics shader with too many input SGPRs");
3170
3171 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3172 if (!MRI.isLiveIn(Reg)) {
3173 Info.setStackPtrOffsetReg(Reg);
3174 break;
3175 }
3176 }
3177
3178 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3179 report_fatal_error("failed to find register for SP");
3180 }
3181
3182 // hasFP should be accurate for entry functions even before the frame is
3183 // finalized, because it does not rely on the known stack size, only
3184 // properties like whether variable sized objects are present.
3185 if (ST.getFrameLowering()->hasFP(MF)) {
3186 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3187 }
3188}
3189
3192 return !Info->isEntryFunction();
3193}
3194
3196
3198 MachineBasicBlock *Entry,
3199 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3201
3202 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3203 if (!IStart)
3204 return;
3205
3206 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3207 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3208 MachineBasicBlock::iterator MBBI = Entry->begin();
3209 for (const MCPhysReg *I = IStart; *I; ++I) {
3210 const TargetRegisterClass *RC = nullptr;
3211 if (AMDGPU::SReg_64RegClass.contains(*I))
3212 RC = &AMDGPU::SGPR_64RegClass;
3213 else if (AMDGPU::SReg_32RegClass.contains(*I))
3214 RC = &AMDGPU::SGPR_32RegClass;
3215 else
3216 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3217
3218 Register NewVR = MRI->createVirtualRegister(RC);
3219 // Create copy from CSR to a virtual register.
3220 Entry->addLiveIn(*I);
3221 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3222 .addReg(*I);
3223
3224 // Insert the copy-back instructions right before the terminator.
3225 for (auto *Exit : Exits)
3226 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3227 TII->get(TargetOpcode::COPY), *I)
3228 .addReg(NewVR);
3229 }
3230}
3231
3233 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3234 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3235 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3237
3239 const Function &Fn = MF.getFunction();
3242 bool IsError = false;
3243
3244 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3246 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3247 IsError = true;
3248 }
3249
3252 BitVector Skipped(Ins.size());
3253 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3254 *DAG.getContext());
3255
3256 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3257 bool IsKernel = AMDGPU::isKernel(CallConv);
3258 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3259
3260 if (IsGraphics) {
3261 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3262 assert(!UserSGPRInfo.hasDispatchPtr() &&
3263 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3264 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3265 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3266 (void)UserSGPRInfo;
3267 if (!Subtarget->enableFlatScratch())
3268 assert(!UserSGPRInfo.hasFlatScratchInit());
3269 if ((CallConv != CallingConv::AMDGPU_CS &&
3270 CallConv != CallingConv::AMDGPU_Gfx &&
3271 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3272 !Subtarget->hasArchitectedSGPRs())
3273 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3274 !Info->hasWorkGroupIDZ());
3275 }
3276
3277 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3278
3279 if (CallConv == CallingConv::AMDGPU_PS) {
3280 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3281
3282 // At least one interpolation mode must be enabled or else the GPU will
3283 // hang.
3284 //
3285 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3286 // set PSInputAddr, the user wants to enable some bits after the compilation
3287 // based on run-time states. Since we can't know what the final PSInputEna
3288 // will look like, we shouldn't do anything here and the user should take
3289 // responsibility for the correct programming.
3290 //
3291 // Otherwise, the following restrictions apply:
3292 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3293 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3294 // enabled too.
3295 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3296 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3297 CCInfo.AllocateReg(AMDGPU::VGPR0);
3298 CCInfo.AllocateReg(AMDGPU::VGPR1);
3299 Info->markPSInputAllocated(0);
3300 Info->markPSInputEnabled(0);
3301 }
3302 if (Subtarget->isAmdPalOS()) {
3303 // For isAmdPalOS, the user does not enable some bits after compilation
3304 // based on run-time states; the register values being generated here are
3305 // the final ones set in hardware. Therefore we need to apply the
3306 // workaround to PSInputAddr and PSInputEnable together. (The case where
3307 // a bit is set in PSInputAddr but not PSInputEnable is where the
3308 // frontend set up an input arg for a particular interpolation mode, but
3309 // nothing uses that input arg. Really we should have an earlier pass
3310 // that removes such an arg.)
3311 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3312 if ((PsInputBits & 0x7F) == 0 ||
3313 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3314 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3315 }
3316 } else if (IsKernel) {
3317 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3318 } else {
3319 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3320 Ins.end());
3321 }
3322
3323 if (IsKernel)
3324 analyzeFormalArgumentsCompute(CCInfo, Ins);
3325
3326 if (IsEntryFunc) {
3327 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3328 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3329 if (IsKernel && Subtarget->hasKernargPreload())
3330 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3331
3332 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3333 } else if (!IsGraphics) {
3334 // For the fixed ABI, pass workitem IDs in the last argument register.
3335 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3336
3337 // FIXME: Sink this into allocateSpecialInputSGPRs
3338 if (!Subtarget->enableFlatScratch())
3339 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3340
3341 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3342 }
3343
3344 if (!IsKernel) {
3345 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3346 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3347
3348 // This assumes the registers are allocated by CCInfo in ascending order
3349 // with no gaps.
3350 Info->setNumWaveDispatchSGPRs(
3351 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3352 Info->setNumWaveDispatchVGPRs(
3353 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3354 } else if (Info->getNumKernargPreloadedSGPRs()) {
3355 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3356 }
3357
3359
3360 if (IsWholeWaveFunc) {
3362 {MVT::i1, MVT::Other}, Chain);
3363 InVals.push_back(Setup.getValue(0));
3364 Chains.push_back(Setup.getValue(1));
3365 }
3366
3367 // FIXME: This is the minimum kernel argument alignment. We should improve
3368 // this to the maximum alignment of the arguments.
3369 //
3370 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3371 // kern arg offset.
3372 const Align KernelArgBaseAlign = Align(16);
3373
3374 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3375 ++i) {
3376 const ISD::InputArg &Arg = Ins[i];
3377 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3378 InVals.push_back(DAG.getPOISON(Arg.VT));
3379 continue;
3380 }
3381
3382 CCValAssign &VA = ArgLocs[ArgIdx++];
3383 MVT VT = VA.getLocVT();
3384
3385 if (IsEntryFunc && VA.isMemLoc()) {
3386 VT = Ins[i].VT;
3387 EVT MemVT = VA.getLocVT();
3388
3389 const uint64_t Offset = VA.getLocMemOffset();
3390 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
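// The alignment known to hold for this argument is the largest power of two
// that divides both the 16-byte kernarg base alignment and the argument's
// byte offset within the kernarg segment.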
3391
3392 if (Arg.Flags.isByRef()) {
3393 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3394
3395 const GCNTargetMachine &TM =
3396 static_cast<const GCNTargetMachine &>(getTargetMachine());
3397 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3398 Arg.Flags.getPointerAddrSpace())) {
3399 Ptr = DAG.getAddrSpaceCast(DL, Arg.VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3400 Arg.Flags.getPointerAddrSpace());
3401 }
3402
3403 InVals.push_back(Ptr);
3404 continue;
3405 }
3406
3407 SDValue NewArg;
3408 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3409 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3410 // In this case the argument is packed into the previous preload SGPR.
3411 int64_t AlignDownOffset = alignDown(Offset, 4);
3412 int64_t OffsetDiff = Offset - AlignDownOffset;
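// OffsetDiff is the argument's byte offset within the dword-sized SGPR it was
// packed into; shifting right by OffsetDiff * 8 below moves those bytes down
// to bit 0 before truncating to the argument's memory type.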
3413 EVT IntVT = MemVT.changeTypeToInteger();
3414
3415 const SIMachineFunctionInfo *Info =
3416 MF.getInfo<SIMachineFunctionInfo>();
3417
3418 Register Reg =
3419 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3420
3421 assert(Reg);
3422 Register VReg = MRI.getLiveInVirtReg(Reg);
3423 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3424
3425 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3426 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3427
3428 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3429 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3430 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3431 Ins[i].Flags.isSExt(), &Ins[i]);
3432
3433 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3434 } else {
3435 const SIMachineFunctionInfo *Info =
3436 MF.getInfo<SIMachineFunctionInfo>();
3437
3438 const SmallVectorImpl<MCRegister> &PreloadRegs =
3439 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3440
3441 SDValue Copy;
3442 if (PreloadRegs.size() == 1) {
3443 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3444 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3445 NewArg = DAG.getCopyFromReg(
3446 Chain, DL, VReg,
3447 EVT::getIntegerVT(*DAG.getContext(),
3448 TRI->getRegSizeInBits(*RC)));
3449
3450 } else {
3451 // If the kernarg alignment does not match the alignment of the SGPR
3452 // tuple RC that can accommodate this argument, it will be built up
3453 // via copies from the individual SGPRs that the argument was
3454 // preloaded to.
3455 SmallVector<SDValue, 4> Elts;
3456 for (auto Reg : PreloadRegs) {
3457 Register VReg = MRI.getLiveInVirtReg(Reg);
3458 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3459 Elts.push_back(Copy);
3460 }
3461 NewArg =
3462 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3463 PreloadRegs.size()),
3464 DL, Elts);
3465 }
3466
3467 // If the argument was preloaded to multiple consecutive 32-bit
3468 // registers because of misalignment between addressable SGPR tuples
3469 // and the argument size, we can still assume, because of kernarg
3470 // segment alignment restrictions, that NewArg's size is the same as
3471 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3472 // truncate since we cannot preload to less than a single SGPR and the
3473 // MemVT may be smaller.
3474 EVT MemVTInt =
3475 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
3476 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3477 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3478
3479 NewArg = DAG.getBitcast(MemVT, NewArg);
3480 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3481 Ins[i].Flags.isSExt(), &Ins[i]);
3482 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3483 }
3484 } else {
3485 // Hidden arguments that are in the kernel signature must be preloaded
3486 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3487 // the argument list and is not preloaded.
3488 if (Arg.isOrigArg()) {
3489 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3490 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3491 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
3492 *OrigArg->getParent(),
3493 "hidden argument in kernel signature was not preloaded",
3494 DL.getDebugLoc()));
3495 }
3496 }
3497
3498 NewArg =
3499 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3500 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3501 }
3502 Chains.push_back(NewArg.getValue(1));
3503
3504 auto *ParamTy =
3505 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3506 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3507 ParamTy &&
3508 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3509 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3510 // On SI local pointers are just offsets into LDS, so they are always
3511 // less than 16-bits. On CI and newer they could potentially be
3512 // real pointers, so we can't guarantee their size.
3513 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3514 DAG.getValueType(MVT::i16));
3515 }
3516
3517 InVals.push_back(NewArg);
3518 continue;
3519 }
3520 if (!IsEntryFunc && VA.isMemLoc()) {
3521 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3522 InVals.push_back(Val);
3523 if (!Arg.Flags.isByVal())
3524 Chains.push_back(Val.getValue(1));
3525 continue;
3526 }
3527
3528 assert(VA.isRegLoc() && "Parameter must be in a register!");
3529
3530 Register Reg = VA.getLocReg();
3531 const TargetRegisterClass *RC = nullptr;
3532 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3533 RC = &AMDGPU::VGPR_32RegClass;
3534 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3535 RC = &AMDGPU::SGPR_32RegClass;
3536 else
3537 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3538
3539 Reg = MF.addLiveIn(Reg, RC);
3540 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3541
3542 if (Arg.Flags.isSRet()) {
3543 // The return object should be reasonably addressable.
3544
3545 // FIXME: This helps when the return is a real sret. If it is an
3546 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3547 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3548 unsigned NumBits =
3549 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
3550 Val = DAG.getNode(
3551 ISD::AssertZext, DL, VT, Val,
3552 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3553 }
3554
3555 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3556 InVals.push_back(Val);
3557 }
3558
3559 // Start adding system SGPRs.
3560 if (IsEntryFunc)
3561 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3562
3563 // DAG.getPass() returns nullptr when using new pass manager.
3564 // TODO: Use DAG.getMFAM() to access analysis result.
3565 if (DAG.getPass()) {
3566 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3567 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3568 }
3569
3570 unsigned StackArgSize = CCInfo.getStackSize();
3571 Info->setBytesInStackArgArea(StackArgSize);
3572
3573 return Chains.empty() ? Chain
3574 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3575}
3576
3577// TODO: If return values can't fit in registers, we should return as many as
3578// possible in registers before passing on stack.
3579 bool SITargetLowering::CanLowerReturn(
3580 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3581 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3582 const Type *RetTy) const {
3583 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3584 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3585 // for shaders. Vector types should be explicitly handled by CC.
3586 if (AMDGPU::isEntryFunctionCC(CallConv))
3587 return true;
3588
3589 SmallVector<CCValAssign, 16> RVLocs;
3590 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3591 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3592 return false;
3593
3594 // We must use the stack if return would require unavailable registers.
3595 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3596 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3597 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3598 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3599 return false;
3600
3601 return true;
3602}
3603
3604SDValue
3605 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3606 bool isVarArg,
3607 const SmallVectorImpl<ISD::OutputArg> &Outs,
3608 const SmallVectorImpl<SDValue> &OutVals,
3609 const SDLoc &DL, SelectionDAG &DAG) const {
3610 MachineFunction &MF = DAG.getMachineFunction();
3611 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3612 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3613
3614 if (AMDGPU::isKernel(CallConv)) {
3615 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3616 OutVals, DL, DAG);
3617 }
3618
3619 bool IsShader = AMDGPU::isShader(CallConv);
3620
3621 Info->setIfReturnsVoid(Outs.empty());
3622 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3623
3624 // CCValAssign - represent the assignment of the return value to a location.
3625 SmallVector<CCValAssign, 48> RVLocs;
3626
3627 // CCState - Info about the registers and stack slots.
3628 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3629 *DAG.getContext());
3630
3631 // Analyze outgoing return values.
3632 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3633
3634 SDValue Glue;
3635 SmallVector<SDValue, 48> RetOps;
3636 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3637
3638 SDValue ReadFirstLane =
3639 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
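// Values returned in SGPRs must be wave-uniform, so anything headed for an
// SGPR location is run through readfirstlane below in case it was actually
// computed in a VGPR.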
3640 // Copy the result values into the output registers.
3641 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3642 ++I, ++RealRVLocIdx) {
3643 CCValAssign &VA = RVLocs[I];
3644 assert(VA.isRegLoc() && "Can only return in registers!");
3645 // TODO: Partially return in registers if return values don't fit.
3646 SDValue Arg = OutVals[RealRVLocIdx];
3647
3648 // Copied from other backends.
3649 switch (VA.getLocInfo()) {
3650 case CCValAssign::Full:
3651 break;
3652 case CCValAssign::BCvt:
3653 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3654 break;
3655 case CCValAssign::SExt:
3656 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3657 break;
3658 case CCValAssign::ZExt:
3659 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3660 break;
3661 case CCValAssign::AExt:
3662 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3663 break;
3664 default:
3665 llvm_unreachable("Unknown loc info!");
3666 }
3667 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3668 Arg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VA.getLocVT(),
3669 ReadFirstLane, Arg);
3670 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3671 Glue = Chain.getValue(1);
3672 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3673 }
3674
3675 // FIXME: Does sret work properly?
3676 if (!Info->isEntryFunction()) {
3677 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3678 const MCPhysReg *I =
3679 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3680 if (I) {
3681 for (; *I; ++I) {
3682 if (AMDGPU::SReg_64RegClass.contains(*I))
3683 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3684 else if (AMDGPU::SReg_32RegClass.contains(*I))
3685 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3686 else
3687 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3688 }
3689 }
3690 }
3691
3692 // Update chain and glue.
3693 RetOps[0] = Chain;
3694 if (Glue.getNode())
3695 RetOps.push_back(Glue);
3696
3697 unsigned Opc = AMDGPUISD::ENDPGM;
3698 if (!IsWaveEnd)
3699 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3700 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3701 : AMDGPUISD::RET_GLUE;
3702 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3703}
3704
3706 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3707 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3708 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3709 SDValue ThisVal) const {
3710 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3711
3712 // Assign locations to each value returned by this call.
3713 SmallVector<CCValAssign, 16> RVLocs;
3714 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3715 *DAG.getContext());
3716 CCInfo.AnalyzeCallResult(Ins, RetCC);
3717
3718 // Copy all of the result registers out of their specified physreg.
3719 for (CCValAssign VA : RVLocs) {
3720 SDValue Val;
3721
3722 if (VA.isRegLoc()) {
3723 Val =
3724 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3725 Chain = Val.getValue(1);
3726 InGlue = Val.getValue(2);
3727 } else if (VA.isMemLoc()) {
3728 report_fatal_error("TODO: return values in memory");
3729 } else
3730 llvm_unreachable("unknown argument location type");
3731
3732 switch (VA.getLocInfo()) {
3733 case CCValAssign::Full:
3734 break;
3735 case CCValAssign::BCvt:
3736 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3737 break;
3738 case CCValAssign::ZExt:
3739 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3740 DAG.getValueType(VA.getValVT()));
3741 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3742 break;
3743 case CCValAssign::SExt:
3744 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3745 DAG.getValueType(VA.getValVT()));
3746 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3747 break;
3748 case CCValAssign::AExt:
3749 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3750 break;
3751 default:
3752 llvm_unreachable("Unknown loc info!");
3753 }
3754
3755 InVals.push_back(Val);
3756 }
3757
3758 return Chain;
3759}
3760
3761// Add code to pass special inputs required depending on used features separate
3762// from the explicit user arguments present in the IR.
3764 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3765 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3766 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3767 // If we don't have a call site, this was a call inserted by
3768 // legalization. These can never use special inputs.
3769 if (!CLI.CB)
3770 return;
3771
3772 SelectionDAG &DAG = CLI.DAG;
3773 const SDLoc &DL = CLI.DL;
3774 const Function &F = DAG.getMachineFunction().getFunction();
3775
3776 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3777 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3778
3779 const AMDGPUFunctionArgInfo *CalleeArgInfo =
3780 &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
3781 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3782 // DAG.getPass() returns nullptr when using new pass manager.
3783 // TODO: Use DAG.getMFAM() to access analysis result.
3784 if (DAG.getPass()) {
3785 auto &ArgUsageInfo =
3786 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3787 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3788 }
3789 }
3790
3791 // TODO: Unify with private memory register handling. This is complicated by
3792 // the fact that at least in kernels, the input argument is not necessarily
3793 // in the same location as the input.
3794 // clang-format off
3795 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3796 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3797 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3798 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3799 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3800 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3801 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3802 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3803 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3804 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3805 };
3806 // clang-format on
3807
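// Each table entry maps an implicit input to the attribute(s) indicating the
// callee does not use it; when every listed attribute is present on the call
// site, copying that input can be skipped entirely.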
3808 for (auto [InputID, Attrs] : ImplicitAttrs) {
3809 // If the callee does not use the attribute value, skip copying the value.
3810 if (all_of(Attrs, [&](StringRef Attr) {
3811 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3812 }))
3813 continue;
3814
3815 const auto [OutgoingArg, ArgRC, ArgTy] =
3816 CalleeArgInfo->getPreloadedValue(InputID);
3817 if (!OutgoingArg)
3818 continue;
3819
3820 const auto [IncomingArg, IncomingArgRC, Ty] =
3821 CallerArgInfo.getPreloadedValue(InputID);
3822 assert(IncomingArgRC == ArgRC);
3823
3824 // All special arguments are ints for now.
3825 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3826 SDValue InputReg;
3827
3828 if (IncomingArg) {
3829 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3830 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3831 // The implicit arg ptr is special because it doesn't have a corresponding
3832 // input for kernels, and is computed from the kernarg segment pointer.
3833 InputReg = getImplicitArgPtr(DAG, DL);
3834 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3835 std::optional<uint32_t> Id =
3836 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
3837 if (Id.has_value()) {
3838 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3839 } else {
3840 InputReg = DAG.getPOISON(ArgVT);
3841 }
3842 } else {
3843 // We may have proven the input wasn't needed, although the ABI is
3844 // requiring it. We just need to allocate the register appropriately.
3845 InputReg = DAG.getPOISON(ArgVT);
3846 }
3847
3848 if (OutgoingArg->isRegister()) {
3849 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3850 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3851 report_fatal_error("failed to allocate implicit input argument");
3852 } else {
3853 unsigned SpecialArgOffset =
3854 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3855 SDValue ArgStore =
3856 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3857 MemOpChains.push_back(ArgStore);
3858 }
3859 }
3860
3861 // Pack workitem IDs into a single register, or pass them as-is if already
3862 // packed.
3863
3864 auto [OutgoingArg, ArgRC, Ty] =
3865 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
3866 if (!OutgoingArg)
3867 std::tie(OutgoingArg, ArgRC, Ty) =
3868 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
3869 if (!OutgoingArg)
3870 std::tie(OutgoingArg, ArgRC, Ty) =
3871 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
3872 if (!OutgoingArg)
3873 return;
3874
3875 const ArgDescriptor *IncomingArgX = std::get<0>(
3876 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
3877 const ArgDescriptor *IncomingArgY = std::get<0>(
3878 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
3879 const ArgDescriptor *IncomingArgZ = std::get<0>(
3880 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
3881
3882 SDValue InputReg;
3883 SDLoc SL;
3884
3885 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3886 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3887 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3888
3889 // If incoming ids are not packed we need to pack them.
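// Packed workitem IDs use 10 bits per component: X in bits [9:0], Y in
// [19:10] and Z in [29:20], hence the shifts by 10 and 20 below.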
3890 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3891 NeedWorkItemIDX) {
3892 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3893 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3894 } else {
3895 InputReg = DAG.getConstant(0, DL, MVT::i32);
3896 }
3897 }
3898
3899 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3900 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3901 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3902 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3903 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3904 InputReg = InputReg.getNode()
3905 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
3906 : Y;
3907 }
3908
3909 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3910 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3911 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3912 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3913 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3914 InputReg = InputReg.getNode()
3915 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
3916 : Z;
3917 }
3918
3919 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3920 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3921 // We're in a situation where the outgoing function requires the workitem
3922 // ID, but the calling function does not have it (e.g. a graphics function
3923 // calling a C calling convention function). This is illegal, but we need
3924 // to produce something.
3925 InputReg = DAG.getPOISON(MVT::i32);
3926 } else {
3927 // Workitem IDs are already packed; any of the present incoming arguments
3928 // will carry all required fields.
3929 ArgDescriptor IncomingArg =
3930 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
3931 : IncomingArgY ? *IncomingArgY
3932 : *IncomingArgZ,
3933 ~0u);
3934 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3935 }
3936 }
3937
3938 if (OutgoingArg->isRegister()) {
3939 if (InputReg)
3940 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3941
3942 CCInfo.AllocateReg(OutgoingArg->getRegister());
3943 } else {
3944 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3945 if (InputReg) {
3946 SDValue ArgStore =
3947 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3948 MemOpChains.push_back(ArgStore);
3949 }
3950 }
3951}
3952
3954 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3956 const SmallVectorImpl<SDValue> &OutVals,
3957 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3958 if (AMDGPU::isChainCC(CalleeCC))
3959 return true;
3960
3961 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
3962 return false;
3963
3964 // For a divergent call target, we need to do a waterfall loop over the
3965 // possible callees which precludes us from using a simple jump.
3966 if (Callee->isDivergent())
3967 return false;
3968
3969 MachineFunction &MF = DAG.getMachineFunction();
3970 const Function &CallerF = MF.getFunction();
3971 CallingConv::ID CallerCC = CallerF.getCallingConv();
3972 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3973 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3974
3975 // Kernels aren't callable, and don't have a live-in return address, so it
3976 // doesn't make sense to do a tail call with entry functions.
3977 if (!CallerPreserved)
3978 return false;
3979
3980 bool CCMatch = CallerCC == CalleeCC;
3981
3982 if (MF.getTarget().Options.GuaranteedTailCallOpt) {
3983 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
3984 return true;
3985 return false;
3986 }
3987
3988 // TODO: Can we handle var args?
3989 if (IsVarArg)
3990 return false;
3991
3992 for (const Argument &Arg : CallerF.args()) {
3993 if (Arg.hasByValAttr())
3994 return false;
3995 }
3996
3997 LLVMContext &Ctx = *DAG.getContext();
3998
3999 // Check that the call results are passed in the same way.
4000 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4001 CCAssignFnForCall(CalleeCC, IsVarArg),
4002 CCAssignFnForCall(CallerCC, IsVarArg)))
4003 return false;
4004
4005 // The callee has to preserve all registers the caller needs to preserve.
4006 if (!CCMatch) {
4007 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4008 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4009 return false;
4010 }
4011
4012 // Nothing more to check if the callee is taking no arguments.
4013 if (Outs.empty())
4014 return true;
4015
4016 SmallVector<CCValAssign, 16> ArgLocs;
4017 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4018
4019 // FIXME: We are not allocating special input registers, so we will be
4020 // deciding based on incorrect register assignments.
4021 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4022
4023 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4024 // If the stack arguments for this call do not fit into our own save area then
4025 // the call cannot be made tail.
4026 // TODO: Is this really necessary?
4027 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4028 return false;
4029
4030 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4031 // FIXME: What about inreg arguments that end up passed in memory?
4032 if (!CCVA.isRegLoc())
4033 continue;
4034
4035 // If we are passing an argument in an SGPR, and the value is divergent,
4036 // this call requires a waterfall loop.
4037 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4038 LLVM_DEBUG(
4039 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4040 << printReg(CCVA.getLocReg(), TRI) << '\n');
4041 return false;
4042 }
4043 }
4044
4045 const MachineRegisterInfo &MRI = MF.getRegInfo();
4046 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4047}
4048
4049 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
4050 if (!CI->isTailCall())
4051 return false;
4052
4053 const Function *ParentFn = CI->getParent()->getParent();
4054 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
4055 return false;
4056 return true;
4057}
4058
4059namespace {
4060// Chain calls have special arguments that we need to handle. These are
4061// tagging along at the end of the arguments list(s), after the SGPR and VGPR
4062// arguments (index 0 and 1 respectively).
4063enum ChainCallArgIdx {
4064 Exec = 2,
4065 Flags,
4066 NumVGPRs,
4067 FallbackExec,
4068 FallbackCallee
4069};
4070} // anonymous namespace
4071
4072// The wave scratch offset register is used as the global base pointer.
4074 SmallVectorImpl<SDValue> &InVals) const {
4075 CallingConv::ID CallConv = CLI.CallConv;
4076 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4077
4078 SelectionDAG &DAG = CLI.DAG;
4079
4080 const SDLoc &DL = CLI.DL;
4081 SDValue Chain = CLI.Chain;
4082 SDValue Callee = CLI.Callee;
4083
4084 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4085 bool UsesDynamicVGPRs = false;
4086 if (IsChainCallConv) {
4087 // The last arguments should be the value that we need to put in EXEC,
4088 // followed by the flags and any other arguments with special meanings.
4089 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4090 // we don't treat them like the "real" arguments.
4091 auto RequestedExecIt =
4092 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4093 return Arg.OrigArgIndex == 2;
4094 });
4095 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4096
4097 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4098 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4099 CLI.OutVals.end());
4100 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4101
4102 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4103 "Haven't popped all the special args");
4104
4105 TargetLowering::ArgListEntry RequestedExecArg =
4106 CLI.Args[ChainCallArgIdx::Exec];
4107 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4108 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4109
4110 // Convert constants into TargetConstants, so they become immediate operands
4111 // instead of being selected into S_MOV.
4112 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4113 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4114 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4115 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4116 } else
4117 ChainCallSpecialArgs.push_back(Arg.Node);
4118 };
4119
4120 PushNodeOrTargetConstant(RequestedExecArg);
4121
4122 // Process any other special arguments depending on the value of the flags.
4123 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4124
4125 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4126 if (FlagsValue.isZero()) {
4127 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4128 return lowerUnhandledCall(CLI, InVals,
4129 "no additional args allowed if flags == 0");
4130 } else if (FlagsValue.isOneBitSet(0)) {
4131 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4132 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4133 }
4134
4135 if (!Subtarget->isWave32()) {
4136 return lowerUnhandledCall(
4137 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4138 }
4139
4140 UsesDynamicVGPRs = true;
4141 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4142 CLI.Args.end(), PushNodeOrTargetConstant);
4143 }
4144 }
4145
4146 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
4147 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4148 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
4149 bool &IsTailCall = CLI.IsTailCall;
4150 bool IsVarArg = CLI.IsVarArg;
4151 bool IsSibCall = false;
4152 MachineFunction &MF = DAG.getMachineFunction();
4153
4154 if (Callee.isUndef() || isNullConstant(Callee)) {
4155 if (!CLI.IsTailCall) {
4156 for (ISD::InputArg &Arg : CLI.Ins)
4157 InVals.push_back(DAG.getPOISON(Arg.VT));
4158 }
4159
4160 return Chain;
4161 }
4162
4163 if (IsVarArg) {
4164 return lowerUnhandledCall(CLI, InVals,
4165 "unsupported call to variadic function ");
4166 }
4167
4168 if (!CLI.CB)
4169 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4170
4171 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4172 return lowerUnhandledCall(CLI, InVals,
4173 "unsupported required tail call to function ");
4174 }
4175
4176 if (IsTailCall) {
4177 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4178 Outs, OutVals, Ins, DAG);
4179 if (!IsTailCall &&
4180 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4181 report_fatal_error("failed to perform tail call elimination on a call "
4182 "site marked musttail or on llvm.amdgcn.cs.chain");
4183 }
4184
4185 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4186
4187 // A sibling call is one where we're under the usual C ABI and not planning
4188 // to change that but can still do a tail call:
4189 if (!TailCallOpt && IsTailCall)
4190 IsSibCall = true;
4191
4192 if (IsTailCall)
4193 ++NumTailCalls;
4194 }
4195
4196 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4197 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
4198 SmallVector<SDValue, 8> MemOpChains;
4199
4200 // Analyze operands of the call, assigning locations to each operand.
4201 SmallVector<CCValAssign, 16> ArgLocs;
4202 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4203 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4204
4205 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4206 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
4207 // With a fixed ABI, allocate fixed registers before user arguments.
4208 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4209 }
4210
4211 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4212
4213 // Get a count of how many bytes are to be pushed on the stack.
4214 unsigned NumBytes = CCInfo.getStackSize();
4215
4216 if (IsSibCall) {
4217 // Since we're not changing the ABI to make this a tail call, the memory
4218 // operands are already available in the caller's incoming argument space.
4219 NumBytes = 0;
4220 }
4221
4222 // FPDiff is the byte offset of the call's argument area from the callee's.
4223 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4224 // by this amount for a tail call. In a sibling call it must be 0 because the
4225 // caller will deallocate the entire stack and the callee still expects its
4226 // arguments to begin at SP+0. Completely unused for non-tail calls.
4227 int32_t FPDiff = 0;
4228 MachineFrameInfo &MFI = MF.getFrameInfo();
4229 auto *TRI = Subtarget->getRegisterInfo();
4230
4231 // Adjust the stack pointer for the new arguments...
4232 // These operations are automatically eliminated by the prolog/epilog pass
4233 if (!IsSibCall)
4234 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4235
4236 if (!IsSibCall || IsChainCallConv) {
4237 if (!Subtarget->enableFlatScratch()) {
4238 SmallVector<SDValue, 4> CopyFromChains;
4239
4240 // In the HSA case, this should be an identity copy.
4241 SDValue ScratchRSrcReg =
4242 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4243 RegsToPass.emplace_back(IsChainCallConv
4244 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4245 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4246 ScratchRSrcReg);
4247 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4248 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4249 }
4250 }
4251
4252 const unsigned NumSpecialInputs = RegsToPass.size();
4253
4254 MVT PtrVT = MVT::i32;
4255
4256 // Walk the register/memloc assignments, inserting copies/loads.
4257 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4258 CCValAssign &VA = ArgLocs[i];
4259 SDValue Arg = OutVals[i];
4260
4261 // Promote the value if needed.
4262 switch (VA.getLocInfo()) {
4263 case CCValAssign::Full:
4264 break;
4265 case CCValAssign::BCvt:
4266 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4267 break;
4268 case CCValAssign::ZExt:
4269 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4270 break;
4271 case CCValAssign::SExt:
4272 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4273 break;
4274 case CCValAssign::AExt:
4275 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4276 break;
4277 case CCValAssign::FPExt:
4278 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4279 break;
4280 default:
4281 llvm_unreachable("Unknown loc info!");
4282 }
4283
4284 if (VA.isRegLoc()) {
4285 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4286 } else {
4287 assert(VA.isMemLoc());
4288
4289 SDValue DstAddr;
4290 MachinePointerInfo DstInfo;
4291
4292 unsigned LocMemOffset = VA.getLocMemOffset();
4293 int32_t Offset = LocMemOffset;
4294
4295 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4296 MaybeAlign Alignment;
4297
4298 if (IsTailCall) {
4299 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4300 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4301 : VA.getValVT().getStoreSize();
4302
4303 // FIXME: We can have better than the minimum byval required alignment.
4304 Alignment =
4305 Flags.isByVal()
4306 ? Flags.getNonZeroByValAlign()
4307 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4308
4309 Offset = Offset + FPDiff;
4310 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4311
4312 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4313 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4314
4315 // Make sure any stack arguments overlapping with where we're storing
4316 // are loaded before this eventual operation. Otherwise they'll be
4317 // clobbered.
4318
4319 // FIXME: Why is this really necessary? This seems to just result in a
4320 // lot of code to copy the stack and write them back to the same
4321 // locations, which are supposed to be immutable?
4322 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4323 } else {
4324 // Stores to the argument stack area are relative to the stack pointer.
4325 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4326 MVT::i32);
4327 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4328 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4329 Alignment =
4330 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4331 }
4332
4333 if (Outs[i].Flags.isByVal()) {
4334 SDValue SizeNode =
4335 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4336 SDValue Cpy =
4337 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4338 Outs[i].Flags.getNonZeroByValAlign(),
4339 /*isVol = */ false, /*AlwaysInline = */ true,
4340 /*CI=*/nullptr, std::nullopt, DstInfo,
4341 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
4342
4343 MemOpChains.push_back(Cpy);
4344 } else {
4345 SDValue Store =
4346 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4347 MemOpChains.push_back(Store);
4348 }
4349 }
4350 }
4351
4352 if (!MemOpChains.empty())
4353 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4354
4355 SDValue ReadFirstLaneID =
4356 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4357
4358 SDValue TokenGlue;
4359 if (CLI.ConvergenceControlToken) {
4360 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4361 CLI.ConvergenceControlToken);
4362 }
4363
4364 // Build a sequence of copy-to-reg nodes chained together with token chain
4365 // and flag operands which copy the outgoing args into the appropriate regs.
4366 SDValue InGlue;
4367
4368 unsigned ArgIdx = 0;
4369 for (auto [Reg, Val] : RegsToPass) {
4370 if (ArgIdx++ >= NumSpecialInputs &&
4371 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4372 // For chain calls, the inreg arguments are required to be
4373 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4374 // they are uniform.
4375 //
4376 // For other calls, if an inreg argument is known to be uniform,
4377 // speculatively insert a readfirstlane in case it is in a VGPR.
4378 //
4379 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4380 // value, so let that continue to produce invalid code.
4381
4382 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4383 if (TokenGlue)
4384 ReadfirstlaneArgs.push_back(TokenGlue);
4385 Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
4386 ReadfirstlaneArgs);
4387 }
4388
4389 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4390 InGlue = Chain.getValue(1);
4391 }
4392
4393 // We don't usually want to end the call-sequence here because we would tidy
4394 // the frame up *after* the call, however in the ABI-changing tail-call case
4395 // we've carefully laid out the parameters so that when sp is reset they'll be
4396 // in the correct location.
4397 if (IsTailCall && !IsSibCall) {
4398 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4399 InGlue = Chain.getValue(1);
4400 }
4401
4402 std::vector<SDValue> Ops({Chain});
4403
4404 // Add a redundant copy of the callee global which will not be legalized, as
4405 // we need direct access to the callee later.
4406 if (const GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
4407 const GlobalValue *GV = GSD->getGlobal();
4408 Ops.push_back(Callee);
4409 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4410 } else {
4411 if (IsTailCall) {
4412 // isEligibleForTailCallOptimization considered whether the call target is
4413 // divergent, but we may still end up with a uniform value in a VGPR.
4414 // Insert a readfirstlane just in case.
4415 SDValue ReadFirstLaneID =
4416 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4417
4418 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4419 if (TokenGlue)
4420 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4421 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4422 ReadfirstlaneArgs);
4423 }
4424
4425 Ops.push_back(Callee);
4426 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4427 }
4428
4429 if (IsTailCall) {
4430 // Each tail call may have to adjust the stack by a different amount, so
4431 // this information must travel along with the operation for eventual
4432 // consumption by emitEpilogue.
4433 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4434 }
4435
4436 if (IsChainCallConv)
4437 llvm::append_range(Ops, ChainCallSpecialArgs);
4438
4439 // Add argument registers to the end of the list so that they are known live
4440 // into the call.
4441 for (auto &[Reg, Val] : RegsToPass)
4442 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4443
4444 // Add a register mask operand representing the call-preserved registers.
4445 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4446 assert(Mask && "Missing call preserved mask for calling convention");
4447 Ops.push_back(DAG.getRegisterMask(Mask));
4448
4449 if (SDValue Token = CLI.ConvergenceControlToken) {
4450 SmallVector<SDValue, 2> GlueOps;
4451 GlueOps.push_back(Token);
4452 if (InGlue)
4453 GlueOps.push_back(InGlue);
4454
4455 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4456 MVT::Glue, GlueOps),
4457 0);
4458 }
4459
4460 if (InGlue)
4461 Ops.push_back(InGlue);
4462
4463 // If we're doing a tail call, use a TC_RETURN here rather than an
4464 // actual call instruction.
4465 if (IsTailCall) {
4466 MFI.setHasTailCall();
4467 unsigned OPC = AMDGPUISD::TC_RETURN;
4468 switch (CallConv) {
4469 case CallingConv::AMDGPU_Gfx:
4470 OPC = AMDGPUISD::TC_RETURN_GFX;
4471 break;
4472 case CallingConv::AMDGPU_CS_Chain:
4473 case CallingConv::AMDGPU_CS_ChainPreserve:
4474 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4475 : AMDGPUISD::TC_RETURN_CHAIN;
4476 break;
4477 }
4478
4479 // If the caller is a whole wave function, we need to use a special opcode
4480 // so we can patch up EXEC.
4481 if (Info->isWholeWaveFunction())
4483
4484 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4485 }
4486
4487 // Returns a chain and a flag for retval copy to use.
4488 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4489 Chain = Call.getValue(0);
4490 InGlue = Call.getValue(1);
4491
4492 uint64_t CalleePopBytes = NumBytes;
4493 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4494 if (!Ins.empty())
4495 InGlue = Chain.getValue(1);
4496
4497 // Handle result values, copying them out of physregs into vregs that we
4498 // return.
4499 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4500 InVals, /*IsThisReturn=*/false, SDValue());
4501}
4502
4503// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4504// except for:
4505 // 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4506 // 2. Scaled size, where scale = wave-reduction(alloca-size) * wave-size
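// The stack pointer manipulated here is a wave-level scalar byte offset into
// the scratch backing store, so per-lane allocation sizes and alignments are
// scaled by the wavefront size before being applied to it.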
4508 SelectionDAG &DAG) const {
4509 const MachineFunction &MF = DAG.getMachineFunction();
4510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4511
4512 SDLoc dl(Op);
4513 EVT VT = Op.getValueType();
4514 SDValue Chain = Op.getOperand(0);
4515 Register SPReg = Info->getStackPtrOffsetReg();
4516
4517 // Chain the dynamic stack allocation so that it doesn't modify the stack
4518 // pointer when other instructions are using the stack.
4519 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4520
4521 SDValue Size = Op.getOperand(1);
4522 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4523 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4524
4525 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4526 assert(TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp &&
4527 "Stack grows upwards for AMDGPU");
4528
4529 Chain = BaseAddr.getValue(1);
4530 Align StackAlign = TFL->getStackAlign();
4531 if (Alignment > StackAlign) {
4532 uint64_t ScaledAlignment = Alignment.value()
4533 << Subtarget->getWavefrontSizeLog2();
4534 uint64_t StackAlignMask = ScaledAlignment - 1;
4535 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4536 DAG.getConstant(StackAlignMask, dl, VT));
4537 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4538 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4539 }
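// The add-then-mask sequence above rounds BaseAddr up to the requested
// alignment, after scaling that alignment from a per-lane to a wave-level
// value.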
4540
4541 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4542 SDValue NewSP;
4543 if (isa<ConstantSDNode>(Size)) {
4544 // For constant sized alloca, scale alloca size by wave-size
4545 SDValue ScaledSize = DAG.getNode(
4546 ISD::SHL, dl, VT, Size,
4547 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4548 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4549 } else {
4550 // For dynamic sized alloca, perform wave-wide reduction to get max of
4551 // alloca size (divergent) and then scale it by wave-size
4552 SDValue WaveReduction =
4553 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4554 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4555 Size, DAG.getConstant(0, dl, MVT::i32));
4556 SDValue ScaledSize = DAG.getNode(
4557 ISD::SHL, dl, VT, Size,
4558 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4559 NewSP =
4560 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4561 SDValue ReadFirstLaneID =
4562 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4563 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4564 NewSP);
4565 }
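// The wave-wide umax reduction and the readfirstlane keep the stack pointer
// update wave-uniform even though the requested alloca size may be divergent.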
4566
4567 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4568 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4569
4570 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4571}
4572
4574 if (Op.getValueType() != MVT::i32)
4575 return Op; // Defer to cannot select error.
4576
4577 Register SP = getStackPointerRegisterToSaveRestore();
4578 SDLoc SL(Op);
4579
4580 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4581
4582 // Convert from wave uniform to swizzled vector address. This should protect
4583 // from any edge cases where the stacksave result isn't directly used with
4584 // stackrestore.
4585 SDValue VectorAddress =
4586 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4587 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4588}
4589
4591 SelectionDAG &DAG) const {
4592 SDLoc SL(Op);
4593 assert(Op.getValueType() == MVT::i32);
4594
4595 uint32_t BothRoundHwReg =
4596 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4597 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4598
4599 SDValue IntrinID =
4600 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4601 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4602 Op.getOperand(0), IntrinID, GetRoundBothImm);
4603
4604 // There are two rounding modes, one for f32 and one for f64/f16. We only
4605 // report in the standard value range if both are the same.
4606 //
4607 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4608 // ties away from zero is not supported, and the other values are rotated by
4609 // 1.
4610 //
4611 // If the two rounding modes are not the same, report a target defined value.
4612
4613 // Mode register rounding mode fields:
4614 //
4615 // [1:0] Single-precision round mode.
4616 // [3:2] Double/Half-precision round mode.
4617 //
4618 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4619 //
4620 // Hardware Spec
4621 // Toward-0 3 0
4622 // Nearest Even 0 1
4623 // +Inf 1 2
4624 // -Inf 2 3
4625 // NearestAway0 N/A 4
4626 //
4627 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4628 // table we can index by the raw hardware mode.
4629 //
4630 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4631
4632 SDValue BitTable =
4633 DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
4634
4635 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4636 SDValue RoundModeTimesNumBits =
4637 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4638
4639 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4640 // knew only one mode was demanded.
4641 SDValue TableValue =
4642 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4643 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4644
4645 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4646 SDValue TableEntry =
4647 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4648
4649 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4650 // if it's an extended value.
4651 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4652 SDValue IsStandardValue =
4653 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4654 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4655 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4656 TableEntry, EnumOffset);
4657
4658 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4659}
4660
4662 SelectionDAG &DAG) const {
4663 SDLoc SL(Op);
4664
4665 SDValue NewMode = Op.getOperand(1);
4666 assert(NewMode.getValueType() == MVT::i32);
4667
4668 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4669 // hardware MODE.fp_round values.
4670 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4671 uint32_t ClampedVal = std::min(
4672 static_cast<uint32_t>(ConstMode->getZExtValue()),
4673 static_cast<uint32_t>(AMDGPU::TowardZeroF32_TowardNegativeF64));
4674 NewMode = DAG.getConstant(
4675 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4676 } else {
4677 // If we know the input can only be one of the supported standard modes in
4678 // the range 0-3, we can use a simplified mapping to hardware values.
4679 KnownBits KB = DAG.computeKnownBits(NewMode);
4680 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4681 // The supported standard values are 0-3. The extended values start at 8. We
4682 // need to offset by 4 if the value is in the extended range.
4683
4684 if (UseReducedTable) {
4685 // Truncate to the low 32-bits.
4686 SDValue BitTable = DAG.getConstant(
4687 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4688
4689 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4690 SDValue RoundModeTimesNumBits =
4691 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4692
4693 NewMode =
4694 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4695
4696 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4697 // the table extracted bits into inline immediates.
4698 } else {
4699 // table_index = umin(value, value - 4)
4700 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4701 SDValue BitTable =
4702 DAG.getConstant(AMDGPU::FltRoundToHWConversionTable, SL, MVT::i64);
4703
4704 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4705 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4706 SDValue IndexVal =
4707 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4708
4709 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4710 SDValue RoundModeTimesNumBits =
4711 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4712
4713 SDValue TableValue =
4714 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4715 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4716
4717 // No need to mask out the high bits since the setreg will ignore them
4718 // anyway.
4719 NewMode = TruncTable;
4720 }
4721
4722 // Insert a readfirstlane in case the value is a VGPR. We could do this
4723 // earlier and keep more operations scalar, but that interferes with
4724 // combining the source.
4725 SDValue ReadFirstLaneID =
4726 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4727 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4728 ReadFirstLaneID, NewMode);
4729 }
4730
4731 // N.B. The setreg will be later folded into s_round_mode on supported
4732 // targets.
4733 SDValue IntrinID =
4734 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4735 uint32_t BothRoundHwReg =
4736 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 4);
4737 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4738
4739 SDValue SetReg =
4740 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4741 IntrinID, RoundBothImm, NewMode);
4742
4743 return SetReg;
4744}
4745
4747 if (Op->isDivergent() &&
4748 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4749 // Cannot do I$ prefetch with divergent pointer.
4750 return SDValue();
4751
4752 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4756 break;
4758 if (Subtarget->hasSafeSmemPrefetch())
4759 break;
4760 [[fallthrough]];
4761 default:
4762 return SDValue();
4763 }
4764
4765 // I$ prefetch
4766 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4767 return SDValue();
4768
4769 return Op;
4770}
4771
4772// Work around DAG legality rules only based on the result type.
4774 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4775 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4776 EVT SrcVT = Src.getValueType();
4777
4778 if (SrcVT.getScalarType() != MVT::bf16)
4779 return Op;
4780
4781 SDLoc SL(Op);
4782 SDValue BitCast =
4783 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4784
4785 EVT DstVT = Op.getValueType();
4786 if (IsStrict)
4787 llvm_unreachable("Need STRICT_BF16_TO_FP");
4788
4789 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4790}
4791
4793 SDLoc SL(Op);
4794 if (Op.getValueType() != MVT::i64)
4795 return Op;
4796
4797 uint32_t ModeHwReg =
4799 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4800 uint32_t TrapHwReg =
4802 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4803
4804 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4805 SDValue IntrinID =
4806 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4807 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4808 Op.getOperand(0), IntrinID, ModeHwRegImm);
4809 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4810 Op.getOperand(0), IntrinID, TrapHwRegImm);
4811 SDValue TokenReg =
4812 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4813 GetTrapReg.getValue(1));
4814
4815 SDValue CvtPtr =
4816 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4817 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4818
4819 return DAG.getMergeValues({Result, TokenReg}, SL);
4820}
4821
4823 SDLoc SL(Op);
4824 if (Op.getOperand(1).getValueType() != MVT::i64)
4825 return Op;
4826
4827 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4828 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4829 DAG.getConstant(0, SL, MVT::i32));
4830 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4831 DAG.getConstant(1, SL, MVT::i32));
4832
4833 SDValue ReadFirstLaneID =
4834 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4835 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4836 ReadFirstLaneID, NewModeReg);
4837 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4838 ReadFirstLaneID, NewTrapReg);
4839
4840 unsigned ModeHwReg =
4842 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4843 unsigned TrapHwReg =
4845 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4846
4847 SDValue IntrinID =
4848 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4849 SDValue SetModeReg =
4850 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4851 IntrinID, ModeHwRegImm, NewModeReg);
4852 SDValue SetTrapReg =
4853 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4854 IntrinID, TrapHwRegImm, NewTrapReg);
4855 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4856}
4857
4859 const MachineFunction &MF) const {
4860 const Function &Fn = MF.getFunction();
4861
4862 Register Reg = StringSwitch<Register>(RegName)
4863 .Case("m0", AMDGPU::M0)
4864 .Case("exec", AMDGPU::EXEC)
4865 .Case("exec_lo", AMDGPU::EXEC_LO)
4866 .Case("exec_hi", AMDGPU::EXEC_HI)
4867 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4868 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4869 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4870 .Default(Register());
4871 if (!Reg)
4872 return Reg;
4873
4874 if (!Subtarget->hasFlatScrRegister() &&
4875 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4876 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
4877 "\" for subtarget."));
4878 }
4879
4880 switch (Reg) {
4881 case AMDGPU::M0:
4882 case AMDGPU::EXEC_LO:
4883 case AMDGPU::EXEC_HI:
4884 case AMDGPU::FLAT_SCR_LO:
4885 case AMDGPU::FLAT_SCR_HI:
4886 if (VT.getSizeInBits() == 32)
4887 return Reg;
4888 break;
4889 case AMDGPU::EXEC:
4890 case AMDGPU::FLAT_SCR:
4891 if (VT.getSizeInBits() == 64)
4892 return Reg;
4893 break;
4894 default:
4895 llvm_unreachable("missing register type checking");
4896 }
4897
4898 report_fatal_error(
4899 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
4900}
4901
4902// If kill is not the last instruction, split the block so kill is always a
4903// proper terminator.
4904 MachineBasicBlock *
4905 SITargetLowering::splitKillBlock(MachineInstr &MI,
4906 MachineBasicBlock *BB) const {
4907 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
4908 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4909 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4910 return SplitBB;
4911}
4912
4913 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4914// \p MI will be the only instruction in the loop body block. Otherwise, it will
4915// be the first instruction in the remainder block.
4916//
4917/// \returns { LoopBody, Remainder }
4918static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4919 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
4920 MachineFunction *MF = MBB.getParent();
4921 MachineBasicBlock::iterator I(&MI);
4922
4923 // To insert the loop we need to split the block. Move everything after this
4924 // point to a new block, and insert a new empty block between the two.
4925 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
4926 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4927 MachineFunction::iterator MBBI(MBB);
4928 ++MBBI;
4929
4930 MF->insert(MBBI, LoopBB);
4931 MF->insert(MBBI, RemainderBB);
4932
4933 LoopBB->addSuccessor(LoopBB);
4934 LoopBB->addSuccessor(RemainderBB);
4935
4936 // Move the rest of the block into a new block.
4937 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4938
4939 if (InstInLoop) {
4940 auto Next = std::next(I);
4941
4942 // Move instruction to loop body.
4943 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4944
4945 // Move the rest of the block.
4946 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4947 } else {
4948 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4949 }
4950
4951 MBB.addSuccessor(LoopBB);
4952
4953 return std::pair(LoopBB, RemainderBB);
4954}
4955
4956/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4957 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
4958 MachineBasicBlock *MBB = MI.getParent();
4959 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4960 auto I = MI.getIterator();
4961 auto E = std::next(I);
4962
4963 // clang-format off
4964 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4965 .addImm(0);
4966 // clang-format on
4967
4968 MIBundleBuilder Bundler(*MBB, I, E);
4969 finalizeBundle(*MBB, Bundler.begin());
4970}
4971
4974 MachineBasicBlock *BB) const {
4975 const DebugLoc &DL = MI.getDebugLoc();
4976
4978
4980
4981 // Apparently kill flags are only valid if the def is in the same block?
4982 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4983 Src->setIsKill(false);
4984
4985 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
4986
4987 MachineBasicBlock::iterator I = LoopBB->end();
4988
4989 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4991
4992 // Clear TRAP_STS.MEM_VIOL
4993 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4994 .addImm(0)
4995 .addImm(EncodedReg);
4996
4998
4999 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5000
5001 // Load and check TRAP_STS.MEM_VIOL
5002 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5003 .addImm(EncodedReg);
5004
5005 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5006 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5007 .addReg(Reg, RegState::Kill)
5008 .addImm(0);
5009 // clang-format off
5010 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5011 .addMBB(LoopBB);
5012 // clang-format on
5013
5014 return RemainderBB;
5015}
5016
5017// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5018// wavefront. If the value is uniform and just happens to be in a VGPR, this
5019// will only do one iteration. In the worst case, this will loop 64 times.
5020//
5021// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5022 static MachineBasicBlock::iterator
5023 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
5024 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5025 const DebugLoc &DL, const MachineOperand &Idx,
5026 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5027 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5028 Register &SGPRIdxReg) {
5029
5030 MachineFunction *MF = OrigBB.getParent();
5031 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5032 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5035
5036 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5037 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5038 Register NewExec = MRI.createVirtualRegister(BoolRC);
5039 Register CurrentIdxReg =
5040 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5041 Register CondReg = MRI.createVirtualRegister(BoolRC);
5042
5043 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5044 .addReg(InitReg)
5045 .addMBB(&OrigBB)
5046 .addReg(ResultReg)
5047 .addMBB(&LoopBB);
5048
5049 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5050 .addReg(InitSaveExecReg)
5051 .addMBB(&OrigBB)
5052 .addReg(NewExec)
5053 .addMBB(&LoopBB);
5054
5055 // Read the next variant <- also loop target.
5056 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5057 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5058
5059 // Compare the just read M0 value to all possible Idx values.
5060 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5061 .addReg(CurrentIdxReg)
5062 .addReg(Idx.getReg(), 0, Idx.getSubReg());
5063
5064 // Update EXEC, save the original EXEC value to VCC.
5065 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5066 .addReg(CondReg, RegState::Kill);
5067
5068 MRI.setSimpleHint(NewExec, CondReg);
5069
5070 if (UseGPRIdxMode) {
5071 if (Offset == 0) {
5072 SGPRIdxReg = CurrentIdxReg;
5073 } else {
5074 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5075 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5076 .addReg(CurrentIdxReg, RegState::Kill)
5077 .addImm(Offset);
5078 }
5079 } else {
5080 // Move the index into M0.
5081 if (Offset == 0) {
5082 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5083 .addReg(CurrentIdxReg, RegState::Kill);
5084 } else {
5085 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5086 .addReg(CurrentIdxReg, RegState::Kill)
5087 .addImm(Offset);
5088 }
5089 }
5090
5091 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5092 MachineInstr *InsertPt =
5093 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5094 .addReg(LMC.ExecReg)
5095 .addReg(NewExec);
5096
5097 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5098 // s_cbranch_scc0?
5099
5100 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5101 // clang-format off
5102 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5103 .addMBB(&LoopBB);
5104 // clang-format on
5105
5106 return InsertPt->getIterator();
5107}
5108
5109 // This has slightly sub-optimal regalloc when the source vector is killed by
5110 // the read. The register allocator does not understand that the kill is
5111 // per-workitem, so the source is kept alive for the whole loop and we end up
5112 // not reusing a subregister from it, using one more VGPR than necessary. This
5113 // extra VGPR was saved back when this was expanded after register allocation.
5114 static MachineBasicBlock::iterator
5115 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
5116 unsigned InitResultReg, unsigned PhiReg, int Offset,
5117 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5118 MachineFunction *MF = MBB.getParent();
5119 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5120 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5122 const DebugLoc &DL = MI.getDebugLoc();
5124
5125 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5126 Register DstReg = MI.getOperand(0).getReg();
5127 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5128 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5130
5131 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5132
5133 // Save the EXEC mask
5134 // clang-format off
5135 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5136 .addReg(LMC.ExecReg);
5137 // clang-format on
5138
5139 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5140
5141 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5142
5143 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5144 InitResultReg, DstReg, PhiReg, TmpExec,
5145 Offset, UseGPRIdxMode, SGPRIdxReg);
5146
5147 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5149 ++MBBI;
5150 MF->insert(MBBI, LandingPad);
5151 LoopBB->removeSuccessor(RemainderBB);
5152 LandingPad->addSuccessor(RemainderBB);
5153 LoopBB->addSuccessor(LandingPad);
5154 MachineBasicBlock::iterator First = LandingPad->begin();
5155 // clang-format off
5156 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5157 .addReg(SaveExec);
5158 // clang-format on
5159
5160 return InsPt;
5161}
5162
5163// Returns subreg index, offset
5164 static std::pair<unsigned, int>
5165 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
5166 const TargetRegisterClass *SuperRC, unsigned VecReg,
5167 int Offset) {
5168 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5169
5170 // Skip out of bounds offsets, or else we would end up using an undefined
5171 // register.
5172 if (Offset >= NumElts || Offset < 0)
5173 return std::pair(AMDGPU::sub0, Offset);
5174
5175 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5176}
5177
5180 int Offset) {
5181 MachineBasicBlock *MBB = MI.getParent();
5182 const DebugLoc &DL = MI.getDebugLoc();
5184
5185 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5186
5187 assert(Idx->getReg() != AMDGPU::NoRegister);
5188
5189 if (Offset == 0) {
5190 // clang-format off
5191 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5192 .add(*Idx);
5193 // clang-format on
5194 } else {
5195 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5196 .add(*Idx)
5197 .addImm(Offset);
5198 }
5199}
5200
5203 int Offset) {
5204 MachineBasicBlock *MBB = MI.getParent();
5205 const DebugLoc &DL = MI.getDebugLoc();
5207
5208 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5209
5210 if (Offset == 0)
5211 return Idx->getReg();
5212
5213 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5214 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5215 .add(*Idx)
5216 .addImm(Offset);
5217 return Tmp;
5218}
5219
5220 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
5221 MachineBasicBlock &MBB,
5222 const GCNSubtarget &ST) {
5223 const SIInstrInfo *TII = ST.getInstrInfo();
5224 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5225 MachineFunction *MF = MBB.getParent();
5226 MachineRegisterInfo &MRI = MF->getRegInfo();
5227
5228 Register Dst = MI.getOperand(0).getReg();
5229 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5230 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5231 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5232
5233 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5234 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5235
5236 unsigned SubReg;
5237 std::tie(SubReg, Offset) =
5238 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5239
5240 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5241
5242 // Check for a SGPR index.
5243 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5245 const DebugLoc &DL = MI.getDebugLoc();
5246
5247 if (UseGPRIdxMode) {
5248 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5249 // to avoid interfering with other uses, so probably requires a new
5250 // optimization pass.
5252
5253 const MCInstrDesc &GPRIDXDesc =
5254 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5255 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5256 .addReg(SrcReg)
5257 .addReg(Idx)
5258 .addImm(SubReg);
5259 } else {
5261
5262 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5263 .addReg(SrcReg, 0, SubReg)
5264 .addReg(SrcReg, RegState::Implicit);
5265 }
5266
5267 MI.eraseFromParent();
5268
5269 return &MBB;
5270 }
5271
5272 // Control flow needs to be inserted if indexing with a VGPR.
5273 const DebugLoc &DL = MI.getDebugLoc();
5275
5276 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5277 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5278
5279 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5280
5281 Register SGPRIdxReg;
5282 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5283 UseGPRIdxMode, SGPRIdxReg);
5284
5285 MachineBasicBlock *LoopBB = InsPt->getParent();
5286
5287 if (UseGPRIdxMode) {
5288 const MCInstrDesc &GPRIDXDesc =
5289 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5290
5291 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5292 .addReg(SrcReg)
5293 .addReg(SGPRIdxReg)
5294 .addImm(SubReg);
5295 } else {
5296 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5297 .addReg(SrcReg, 0, SubReg)
5298 .addReg(SrcReg, RegState::Implicit);
5299 }
5300
5301 MI.eraseFromParent();
5302
5303 return LoopBB;
5304}
5305
5306 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
5307 MachineBasicBlock &MBB,
5308 const GCNSubtarget &ST) {
5309 const SIInstrInfo *TII = ST.getInstrInfo();
5310 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5311 MachineFunction *MF = MBB.getParent();
5312 MachineRegisterInfo &MRI = MF->getRegInfo();
5313
5314 Register Dst = MI.getOperand(0).getReg();
5315 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5316 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5317 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5318 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5319 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5320 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5321
5322 // This can be an immediate, but will be folded later.
5323 assert(Val->getReg());
5324
5325 unsigned SubReg;
5326 std::tie(SubReg, Offset) =
5327 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5328 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5329
5330 if (Idx->getReg() == AMDGPU::NoRegister) {
5332 const DebugLoc &DL = MI.getDebugLoc();
5333
5334 assert(Offset == 0);
5335
5336 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5337 .add(*SrcVec)
5338 .add(*Val)
5339 .addImm(SubReg);
5340
5341 MI.eraseFromParent();
5342 return &MBB;
5343 }
5344
5345 // Check for a SGPR index.
5346 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5348 const DebugLoc &DL = MI.getDebugLoc();
5349
5350 if (UseGPRIdxMode) {
5352
5353 const MCInstrDesc &GPRIDXDesc =
5354 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5355 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5356 .addReg(SrcVec->getReg())
5357 .add(*Val)
5358 .addReg(Idx)
5359 .addImm(SubReg);
5360 } else {
5362
5363 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5364 TRI.getRegSizeInBits(*VecRC), 32, false);
5365 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5366 .addReg(SrcVec->getReg())
5367 .add(*Val)
5368 .addImm(SubReg);
5369 }
5370 MI.eraseFromParent();
5371 return &MBB;
5372 }
5373
5374 // Control flow needs to be inserted if indexing with a VGPR.
5375 if (Val->isReg())
5376 MRI.clearKillFlags(Val->getReg());
5377
5378 const DebugLoc &DL = MI.getDebugLoc();
5379
5380 Register PhiReg = MRI.createVirtualRegister(VecRC);
5381
5382 Register SGPRIdxReg;
5383 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5384 UseGPRIdxMode, SGPRIdxReg);
5385 MachineBasicBlock *LoopBB = InsPt->getParent();
5386
5387 if (UseGPRIdxMode) {
5388 const MCInstrDesc &GPRIDXDesc =
5389 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5390
5391 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5392 .addReg(PhiReg)
5393 .add(*Val)
5394 .addReg(SGPRIdxReg)
5395 .addImm(SubReg);
5396 } else {
5397 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5398 TRI.getRegSizeInBits(*VecRC), 32, false);
5399 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5400 .addReg(PhiReg)
5401 .add(*Val)
5402 .addImm(SubReg);
5403 }
5404
5405 MI.eraseFromParent();
5406 return LoopBB;
5407}
5408
5409 static MachineBasicBlock *Expand64BitScalarArithmetic(MachineInstr &MI,
5410 MachineBasicBlock *BB) {
5411 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5412 // For GFX12, we emit s_add_u64 and s_sub_u64.
5413 MachineFunction *MF = BB->getParent();
5414 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5415 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5417 const DebugLoc &DL = MI.getDebugLoc();
5418 MachineOperand &Dest = MI.getOperand(0);
5419 MachineOperand &Src0 = MI.getOperand(1);
5420 MachineOperand &Src1 = MI.getOperand(2);
5421 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5422 if (ST.hasScalarAddSub64()) {
5423 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5424 // clang-format off
5425 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5426 .add(Src0)
5427 .add(Src1);
5428 // clang-format on
5429 } else {
5430 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5431 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5432
5433 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5434 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5435
5436 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5437 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5438 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5439 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5440
5441 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5442 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5443 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5444 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5445
5446 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5447 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5448 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5449 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5450 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5451 .addReg(DestSub0)
5452 .addImm(AMDGPU::sub0)
5453 .addReg(DestSub1)
5454 .addImm(AMDGPU::sub1);
5455 }
5456 MI.eraseFromParent();
5457 return BB;
5458}
5459
5460 static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc) {
5461 switch (Opc) {
5462 case AMDGPU::S_MIN_U32:
5463 return std::numeric_limits<uint32_t>::max();
5464 case AMDGPU::S_MIN_I32:
5465 return std::numeric_limits<int32_t>::max();
5466 case AMDGPU::S_MAX_U32:
5467 return std::numeric_limits<uint32_t>::min();
5468 case AMDGPU::S_MAX_I32:
5469 return std::numeric_limits<int32_t>::min();
5470 case AMDGPU::S_ADD_I32:
5471 case AMDGPU::S_SUB_I32:
5472 case AMDGPU::S_OR_B32:
5473 case AMDGPU::S_XOR_B32:
5474 return std::numeric_limits<uint32_t>::min();
5475 case AMDGPU::S_AND_B32:
5476 return std::numeric_limits<uint32_t>::max();
5477 default:
5478 llvm_unreachable(
5479 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5480 }
5481}
5482
5483 static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc) {
5484 switch (Opc) {
5485 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5486 return std::numeric_limits<uint64_t>::max();
5487 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5488 return std::numeric_limits<int64_t>::max();
5489 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5490 return std::numeric_limits<uint64_t>::min();
5491 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5492 return std::numeric_limits<int64_t>::min();
5493 case AMDGPU::S_ADD_U64_PSEUDO:
5494 case AMDGPU::S_SUB_U64_PSEUDO:
5495 case AMDGPU::S_OR_B64:
5496 case AMDGPU::S_XOR_B64:
5497 return std::numeric_limits<uint64_t>::min();
5498 case AMDGPU::S_AND_B64:
5499 return std::numeric_limits<uint64_t>::max();
5500 default:
5501 llvm_unreachable(
5502 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5503 }
5504}
5505
5506static bool is32bitWaveReduceOperation(unsigned Opc) {
5507 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5508 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5509 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5510 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5511 Opc == AMDGPU::S_XOR_B32;
5512}
5513
5514 static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
5515 MachineBasicBlock &BB,
5516 const GCNSubtarget &ST,
5517 unsigned Opc) {
5519 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5520 const DebugLoc &DL = MI.getDebugLoc();
5521 const SIInstrInfo *TII = ST.getInstrInfo();
5522
5523 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5524 Register SrcReg = MI.getOperand(1).getReg();
5525 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5526 Register DstReg = MI.getOperand(0).getReg();
5527 MachineBasicBlock *RetBB = nullptr;
5528 if (isSGPR) {
5529 switch (Opc) {
5530 case AMDGPU::S_MIN_U32:
5531 case AMDGPU::S_MIN_I32:
5532 case AMDGPU::S_MAX_U32:
5533 case AMDGPU::S_MAX_I32:
5534 case AMDGPU::S_AND_B32:
5535 case AMDGPU::S_OR_B32: {
5536 // Idempotent operations.
5537 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5538 RetBB = &BB;
5539 break;
5540 }
5541 case AMDGPU::V_CMP_LT_U64_e64: // umin
5542 case AMDGPU::V_CMP_LT_I64_e64: // min
5543 case AMDGPU::V_CMP_GT_U64_e64: // umax
5544 case AMDGPU::V_CMP_GT_I64_e64: // max
5545 case AMDGPU::S_AND_B64:
5546 case AMDGPU::S_OR_B64: {
5547 // Idempotent operations.
5548 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5549 RetBB = &BB;
5550 break;
5551 }
5552 case AMDGPU::S_XOR_B32:
5553 case AMDGPU::S_XOR_B64:
5554 case AMDGPU::S_ADD_I32:
5555 case AMDGPU::S_ADD_U64_PSEUDO:
5556 case AMDGPU::S_SUB_I32:
5557 case AMDGPU::S_SUB_U64_PSEUDO: {
5558 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5559 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5560 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5561 Register NumActiveLanes =
5562 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5563
5564 bool IsWave32 = ST.isWave32();
5565 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5566 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5567 unsigned BitCountOpc =
5568 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5569
5570 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5571
5572 auto NewAccumulator =
5573 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5574 .addReg(ExecMask);
5575
5576 switch (Opc) {
5577 case AMDGPU::S_XOR_B32:
5578 case AMDGPU::S_XOR_B64: {
5579 // Performing an XOR operation on a uniform value
5580 // depends on the parity of the number of active lanes.
5581 // For even parity, the result will be 0, for odd
5582 // parity the result will be the same as the input value.
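// For example, with a uniform value x: 5 active lanes give
// x ^ x ^ x ^ x ^ x == x (odd parity), while 4 active lanes give 0.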
5583 Register ParityRegister =
5584 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5585
5586 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5587 .addReg(NewAccumulator->getOperand(0).getReg())
5588 .addImm(1)
5589 .setOperandDead(3); // Dead scc
5590 if (Opc == AMDGPU::S_XOR_B32) {
5591 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5592 .addReg(SrcReg)
5593 .addReg(ParityRegister);
5594 } else {
5595 Register DestSub0 =
5596 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5597 Register DestSub1 =
5598 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5599
5600 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5601 const TargetRegisterClass *SrcSubRC =
5602 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5603
5604 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5605 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5606 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5607 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5608
5609 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5610 .add(Op1L)
5611 .addReg(ParityRegister);
5612
5613 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5614 .add(Op1H)
5615 .addReg(ParityRegister);
5616
5617 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5618 .addReg(DestSub0)
5619 .addImm(AMDGPU::sub0)
5620 .addReg(DestSub1)
5621 .addImm(AMDGPU::sub1);
5622 }
5623 break;
5624 }
5625 case AMDGPU::S_SUB_I32: {
5626 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5627
5628 // Take the negation of the source operand.
5629 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5630 .addImm(0)
5631 .addReg(SrcReg);
5632 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5633 .addReg(NegatedVal)
5634 .addReg(NewAccumulator->getOperand(0).getReg());
5635 break;
5636 }
5637 case AMDGPU::S_ADD_I32: {
5638 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5639 .addReg(SrcReg)
5640 .addReg(NewAccumulator->getOperand(0).getReg());
5641 break;
5642 }
5643 case AMDGPU::S_ADD_U64_PSEUDO:
5644 case AMDGPU::S_SUB_U64_PSEUDO: {
5645 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5646 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5647 Register Op1H_Op0L_Reg =
5648 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 Register Op1L_Op0H_Reg =
5650 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5651 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5652 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5653 Register NegatedValLo =
5654 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5655 Register NegatedValHi =
5656 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5657
5658 const TargetRegisterClass *Src1RC = MRI.getRegClass(SrcReg);
5659 const TargetRegisterClass *Src1SubRC =
5660 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5661
5662 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5663 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5664 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5665 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5666
5667 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5668 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5669 .addImm(0)
5670 .addReg(NewAccumulator->getOperand(0).getReg())
5671 .setOperandDead(3); // Dead scc
5672 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5673 .addReg(NegatedValLo)
5674 .addImm(31)
5675 .setOperandDead(3); // Dead scc
5676 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5677 .add(Op1L)
5678 .addReg(NegatedValHi);
5679 }
5680 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5681 ? NegatedValLo
5682 : NewAccumulator->getOperand(0).getReg();
5683 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5684 .add(Op1L)
5685 .addReg(LowOpcode);
5686 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5687 .add(Op1L)
5688 .addReg(LowOpcode);
5689 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5690 .add(Op1H)
5691 .addReg(LowOpcode);
5692
5693 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5694 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5695 .addReg(CarryReg)
5696 .addReg(Op1H_Op0L_Reg)
5697 .setOperandDead(3); // Dead scc
5698
5699 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5700 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
5701 .addReg(HiVal)
5702 .addReg(Op1L_Op0H_Reg)
5703 .setOperandDead(3); // Dead scc
5704 }
5705 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5706 .addReg(DestSub0)
5707 .addImm(AMDGPU::sub0)
5708 .addReg(DestSub1)
5709 .addImm(AMDGPU::sub1);
5710 break;
5711 }
5712 }
5713 RetBB = &BB;
5714 }
5715 }
5716 } else {
5717 // TODO: Implement the DPP strategy and switch based on the immediate
5718 // strategy operand. For now, for all the cases (default, Iterative and
5719 // DPP) we use the iterative approach.
5720
5721 // To reduce the VGPR using the iterative approach, we need to iterate
5722 // over all the active lanes. Lowering consists of a ComputeLoop, which
5723 // iterates over only the active lanes. We use a copy of the EXEC register
5724 // as the induction variable, and each iteration clears the current lane's
5725 // bit with bitset0 so that we get the next active lane for the next iteration.
5727 Register SrcReg = MI.getOperand(1).getReg();
5728 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
5729
5730 // Create Control flow for loop
5731 // Split MI's Machine Basic block into For loop
5732 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
5733
5734 // Create virtual registers required for lowering.
5735 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5736 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5737 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5738 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5739 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5740 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5741 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5742 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5743 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5744
5745 bool IsWave32 = ST.isWave32();
5746 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5747 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5748
5749 // Create initial values of induction variable from Exec, Accumulator and
5750 // insert branch instr to newly created ComputeBlock
5751 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
5752 if (is32BitOpc) {
5753 uint32_t IdentityValue = getIdentityValueFor32BitWaveReduction(Opc);
5754 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5755 .addImm(IdentityValue);
5756 } else {
5757 uint64_t IdentityValue = getIdentityValueFor64BitWaveReduction(Opc);
5758 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5759 .addImm(IdentityValue);
5760 }
5761 // clang-format off
5762 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
5763 .addMBB(ComputeLoop);
5764 // clang-format on
5765
5766 // Start constructing ComputeLoop
5767 I = ComputeLoop->begin();
5768 auto Accumulator =
5769 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5770 .addReg(IdentityValReg)
5771 .addMBB(&BB);
5772 auto ActiveBits =
5773 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5774 .addReg(LoopIterator)
5775 .addMBB(&BB);
5776
5777 I = ComputeLoop->end();
5778 MachineInstr *NewAccumulator;
5779 // Perform the computations
5780 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5781 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
5782 .addReg(ActiveBitsReg);
5783 if (is32BitOpc) {
5784 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5785 LaneValueReg)
5786 .addReg(SrcReg)
5787 .addReg(FF1Reg);
5788 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5789 .addReg(Accumulator->getOperand(0).getReg())
5790 .addReg(LaneValueReg);
5791 } else {
5792 Register LaneValueLoReg =
5793 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5794 Register LaneValueHiReg =
5795 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5796 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5797 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
5798 const TargetRegisterClass *SrcSubRC =
5799 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5800 MachineOperand Op1L = TII->buildExtractSubRegOrImm(
5801 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5802 MachineOperand Op1H = TII->buildExtractSubRegOrImm(
5803 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5804 // lane value input should be in an sgpr
5805 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5806 LaneValueLoReg)
5807 .add(Op1L)
5808 .addReg(FF1Reg);
5809 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5810 LaneValueHiReg)
5811 .add(Op1H)
5812 .addReg(FF1Reg);
5813 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5814 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5815 .addReg(LaneValueLoReg)
5816 .addImm(AMDGPU::sub0)
5817 .addReg(LaneValueHiReg)
5818 .addImm(AMDGPU::sub1);
5819 switch (Opc) {
5820 case AMDGPU::S_OR_B64:
5821 case AMDGPU::S_AND_B64:
5822 case AMDGPU::S_XOR_B64: {
5823 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5824 .addReg(Accumulator->getOperand(0).getReg())
5825 .addReg(LaneValue->getOperand(0).getReg())
5826 .setOperandDead(3); // Dead scc
5827 break;
5828 }
5829 case AMDGPU::V_CMP_GT_I64_e64:
5830 case AMDGPU::V_CMP_GT_U64_e64:
5831 case AMDGPU::V_CMP_LT_I64_e64:
5832 case AMDGPU::V_CMP_LT_U64_e64: {
5833 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5834 Register ComparisonResultReg =
5835 MRI.createVirtualRegister(WaveMaskRegClass);
5836 const TargetRegisterClass *VregClass = TRI->getVGPR64Class();
5837 const TargetRegisterClass *VSubRegClass =
5838 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5839 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5840 MachineOperand SrcReg0Sub0 =
5841 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5842 VregClass, AMDGPU::sub0, VSubRegClass);
5843 MachineOperand SrcReg0Sub1 =
5844 TII->buildExtractSubRegOrImm(MI, MRI, Accumulator->getOperand(0),
5845 VregClass, AMDGPU::sub1, VSubRegClass);
5846 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5847 AccumulatorVReg)
5848 .add(SrcReg0Sub0)
5849 .addImm(AMDGPU::sub0)
5850 .add(SrcReg0Sub1)
5851 .addImm(AMDGPU::sub1);
5852 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
5853 .addReg(LaneValue->getOperand(0).getReg())
5854 .addReg(AccumulatorVReg);
5855
5856 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5857 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5858 .addReg(LaneMaskReg)
5859 .addReg(ActiveBitsReg);
5860
5861 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5862 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5863 .addReg(LaneValue->getOperand(0).getReg())
5864 .addReg(Accumulator->getOperand(0).getReg());
5865 break;
5866 }
5867 case AMDGPU::S_ADD_U64_PSEUDO:
5868 case AMDGPU::S_SUB_U64_PSEUDO: {
5869 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
5870 .addReg(Accumulator->getOperand(0).getReg())
5871 .addReg(LaneValue->getOperand(0).getReg());
5872 ComputeLoop = Expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
5873 break;
5874 }
5875 }
5876 }
5877 // Manipulate the iterator to get the next active lane
5878 unsigned BITSETOpc =
5879 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5880 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5881 .addReg(FF1Reg)
5882 .addReg(ActiveBitsReg);
5883
5884 // Add phi nodes
5885 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
5886 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5887
5888 // Creating branching
5889 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5890 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
5891 .addReg(NewActiveBitsReg)
5892 .addImm(0);
5893 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5894 .addMBB(ComputeLoop);
5895
5896 RetBB = ComputeEnd;
5897 }
5898 MI.eraseFromParent();
5899 return RetBB;
5900}
5901
5902 MachineBasicBlock *
5903 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
5904 MachineBasicBlock *BB) const {
5905 MachineFunction *MF = BB->getParent();
5906 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
5907 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5908 const SIInstrInfo *TII = ST.getInstrInfo();
5909 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
5910 MachineRegisterInfo &MRI = MF->getRegInfo();
5911 const DebugLoc &DL = MI.getDebugLoc();
5912
5913 switch (MI.getOpcode()) {
5914 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5915 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
5916 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5917 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
5918 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5919 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
5920 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5921 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
5922 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5923 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
5924 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5925 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
5926 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5927 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
5928 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5929 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
5930 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5931 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
5932 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5933 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
5934 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5935 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
5936 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5937 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
5938 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5939 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
5940 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5941 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
5942 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5943 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
5944 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5945 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
5946 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5947 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
5948 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5949 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
5950 case AMDGPU::S_UADDO_PSEUDO:
5951 case AMDGPU::S_USUBO_PSEUDO: {
5952 MachineOperand &Dest0 = MI.getOperand(0);
5953 MachineOperand &Dest1 = MI.getOperand(1);
5954 MachineOperand &Src0 = MI.getOperand(2);
5955 MachineOperand &Src1 = MI.getOperand(3);
5956
5957 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5958 ? AMDGPU::S_ADD_U32
5959 : AMDGPU::S_SUB_U32;
5960 // clang-format off
5961 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
5962 .add(Src0)
5963 .add(Src1);
5964 // clang-format on
5965
5966 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
5967 .addImm(1)
5968 .addImm(0);
5969
5970 MI.eraseFromParent();
5971 return BB;
5972 }
5973 case AMDGPU::S_ADD_U64_PSEUDO:
5974 case AMDGPU::S_SUB_U64_PSEUDO: {
5975 return Expand64BitScalarArithmetic(MI, BB);
5976 }
5977 case AMDGPU::V_ADD_U64_PSEUDO:
5978 case AMDGPU::V_SUB_U64_PSEUDO: {
5979 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5980
5981 MachineOperand &Dest = MI.getOperand(0);
5982 MachineOperand &Src0 = MI.getOperand(1);
5983 MachineOperand &Src1 = MI.getOperand(2);
5984
5985 if (ST.hasAddSubU64Insts()) {
5986 auto I = BuildMI(*BB, MI, DL,
5987 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
5988 : AMDGPU::V_SUB_U64_e64),
5989 Dest.getReg())
5990 .add(Src0)
5991 .add(Src1)
5992 .addImm(0); // clamp
5993 TII->legalizeOperands(*I);
5994 MI.eraseFromParent();
5995 return BB;
5996 }
5997
5998 if (IsAdd && ST.hasLshlAddU64Inst()) {
5999 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6000 Dest.getReg())
6001 .add(Src0)
6002 .addImm(0)
6003 .add(Src1);
6004 TII->legalizeOperands(*Add);
6005 MI.eraseFromParent();
6006 return BB;
6007 }
6008
6009 const auto *CarryRC = TRI->getWaveMaskRegClass();
6010
6011 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6012 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6013
6014 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6015 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6016
6017 const TargetRegisterClass *Src0RC = Src0.isReg()
6018 ? MRI.getRegClass(Src0.getReg())
6019 : &AMDGPU::VReg_64RegClass;
6020 const TargetRegisterClass *Src1RC = Src1.isReg()
6021 ? MRI.getRegClass(Src1.getReg())
6022 : &AMDGPU::VReg_64RegClass;
6023
6024 const TargetRegisterClass *Src0SubRC =
6025 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6026 const TargetRegisterClass *Src1SubRC =
6027 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6028
6029 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6030 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6031 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6032 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6033
6034 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6035 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6036 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6037 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6038
6039 unsigned LoOpc =
6040 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6041 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6042 .addReg(CarryReg, RegState::Define)
6043 .add(SrcReg0Sub0)
6044 .add(SrcReg1Sub0)
6045 .addImm(0); // clamp bit
6046
6047 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6048 MachineInstr *HiHalf =
6049 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6050 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6051 .add(SrcReg0Sub1)
6052 .add(SrcReg1Sub1)
6053 .addReg(CarryReg, RegState::Kill)
6054 .addImm(0); // clamp bit
6055
6056 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6057 .addReg(DestSub0)
6058 .addImm(AMDGPU::sub0)
6059 .addReg(DestSub1)
6060 .addImm(AMDGPU::sub1);
6061 TII->legalizeOperands(*LoHalf);
6062 TII->legalizeOperands(*HiHalf);
6063 MI.eraseFromParent();
6064 return BB;
6065 }
6066 case AMDGPU::S_ADD_CO_PSEUDO:
6067 case AMDGPU::S_SUB_CO_PSEUDO: {
6068 // This pseudo can only be selected
6069 // from a uniform add/subcarry node. All the VGPR operands are
6070 // therefore assumed to be splat vectors.
6072 MachineOperand &Dest = MI.getOperand(0);
6073 MachineOperand &CarryDest = MI.getOperand(1);
6074 MachineOperand &Src0 = MI.getOperand(2);
6075 MachineOperand &Src1 = MI.getOperand(3);
6076 MachineOperand &Src2 = MI.getOperand(4);
6077 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
6078 ? AMDGPU::S_ADDC_U32
6079 : AMDGPU::S_SUBB_U32;
6080 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6081 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6082 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6083 .addReg(Src0.getReg());
6084 Src0.setReg(RegOp0);
6085 }
6086 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6087 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6088 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6089 .addReg(Src1.getReg());
6090 Src1.setReg(RegOp1);
6091 }
6092 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6093 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6094 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6095 .addReg(Src2.getReg());
6096 Src2.setReg(RegOp2);
6097 }
6098
6099 if (ST.isWave64()) {
6100 if (ST.hasScalarCompareEq64()) {
6101 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6102 .addReg(Src2.getReg())
6103 .addImm(0);
6104 } else {
6105 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6106 const TargetRegisterClass *SubRC =
6107 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6108 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6109 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6110 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6111 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6112 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6113
6114 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6115 .add(Src2Sub0)
6116 .add(Src2Sub1);
6117
6118 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6119 .addReg(Src2_32, RegState::Kill)
6120 .addImm(0);
6121 }
6122 } else {
6123 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6124 .addReg(Src2.getReg())
6125 .addImm(0);
6126 }
6127
6128 // clang-format off
6129 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg())
6130 .add(Src0)
6131 .add(Src1);
6132 // clang-format on
6133
6134 unsigned SelOpc =
6135 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6136
6137 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6138 .addImm(-1)
6139 .addImm(0);
6140
6141 MI.eraseFromParent();
6142 return BB;
6143 }
6144 case AMDGPU::SI_INIT_M0: {
6145 MachineOperand &M0Init = MI.getOperand(0);
6146 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6147 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6148 AMDGPU::M0)
6149 .add(M0Init);
6150 MI.eraseFromParent();
6151 return BB;
6152 }
6153 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6154 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6155 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6156 TII->get(AMDGPU::S_CMP_EQ_U32))
6157 .addImm(0)
6158 .addImm(0);
6159 return BB;
6160 }
6161 case AMDGPU::GET_GROUPSTATICSIZE: {
6162 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6163 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6164 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6165 .add(MI.getOperand(0))
6166 .addImm(MFI->getLDSSize());
6167 MI.eraseFromParent();
6168 return BB;
6169 }
6170 case AMDGPU::GET_SHADERCYCLESHILO: {
6172 // The algorithm is:
6173 //
6174 // hi1 = getreg(SHADER_CYCLES_HI)
6175 // lo1 = getreg(SHADER_CYCLES_LO)
6176 // hi2 = getreg(SHADER_CYCLES_HI)
6177 //
6178 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6179 // Otherwise there was overflow and the result is hi2:0. In both cases the
6180 // result should represent the actual time at some point during the sequence
6181 // of three getregs.
6182 using namespace AMDGPU::Hwreg;
6183 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6184 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6185 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6186 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6187 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
6188 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6189 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6190 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
6191 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6192 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
6193 .addReg(RegHi1)
6194 .addReg(RegHi2);
6195 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6196 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
6197 .addReg(RegLo1)
6198 .addImm(0);
6199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
6200 .add(MI.getOperand(0))
6201 .addReg(RegLo)
6202 .addImm(AMDGPU::sub0)
6203 .addReg(RegHi2)
6204 .addImm(AMDGPU::sub1);
6205 MI.eraseFromParent();
6206 return BB;
6207 }
6208 case AMDGPU::SI_INDIRECT_SRC_V1:
6209 case AMDGPU::SI_INDIRECT_SRC_V2:
6210 case AMDGPU::SI_INDIRECT_SRC_V4:
6211 case AMDGPU::SI_INDIRECT_SRC_V8:
6212 case AMDGPU::SI_INDIRECT_SRC_V9:
6213 case AMDGPU::SI_INDIRECT_SRC_V10:
6214 case AMDGPU::SI_INDIRECT_SRC_V11:
6215 case AMDGPU::SI_INDIRECT_SRC_V12:
6216 case AMDGPU::SI_INDIRECT_SRC_V16:
6217 case AMDGPU::SI_INDIRECT_SRC_V32:
6218 return emitIndirectSrc(MI, *BB, *getSubtarget());
6219 case AMDGPU::SI_INDIRECT_DST_V1:
6220 case AMDGPU::SI_INDIRECT_DST_V2:
6221 case AMDGPU::SI_INDIRECT_DST_V4:
6222 case AMDGPU::SI_INDIRECT_DST_V8:
6223 case AMDGPU::SI_INDIRECT_DST_V9:
6224 case AMDGPU::SI_INDIRECT_DST_V10:
6225 case AMDGPU::SI_INDIRECT_DST_V11:
6226 case AMDGPU::SI_INDIRECT_DST_V12:
6227 case AMDGPU::SI_INDIRECT_DST_V16:
6228 case AMDGPU::SI_INDIRECT_DST_V32:
6229 return emitIndirectDst(MI, *BB, *getSubtarget());
6230 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6231 case AMDGPU::SI_KILL_I1_PSEUDO:
6232 return splitKillBlock(MI, BB);
6233 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6234 Register Dst = MI.getOperand(0).getReg();
6235 const MachineOperand &Src0 = MI.getOperand(1);
6236 const MachineOperand &Src1 = MI.getOperand(2);
6237 Register SrcCond = MI.getOperand(3).getReg();
6238
6239 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6240 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6241 const auto *CondRC = TRI->getWaveMaskRegClass();
6242 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
6243
6244 const TargetRegisterClass *Src0RC = Src0.isReg()
6245 ? MRI.getRegClass(Src0.getReg())
6246 : &AMDGPU::VReg_64RegClass;
6247 const TargetRegisterClass *Src1RC = Src1.isReg()
6248 ? MRI.getRegClass(Src1.getReg())
6249 : &AMDGPU::VReg_64RegClass;
6250
6251 const TargetRegisterClass *Src0SubRC =
6252 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6253 const TargetRegisterClass *Src1SubRC =
6254 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6255
6256 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
6257 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6258 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
6259 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6260
6261 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
6262 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6263 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
6264 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6265
6266 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
6267 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
6268 .addImm(0)
6269 .add(Src0Sub0)
6270 .addImm(0)
6271 .add(Src1Sub0)
6272 .addReg(SrcCondCopy);
6273 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
6274 .addImm(0)
6275 .add(Src0Sub1)
6276 .addImm(0)
6277 .add(Src1Sub1)
6278 .addReg(SrcCondCopy);
6279
6280 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
6281 .addReg(DstLo)
6282 .addImm(AMDGPU::sub0)
6283 .addReg(DstHi)
6284 .addImm(AMDGPU::sub1);
6285 MI.eraseFromParent();
6286 return BB;
6287 }
6288 case AMDGPU::SI_BR_UNDEF: {
6289 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6290 .add(MI.getOperand(0));
6291 Br->getOperand(1).setIsUndef(); // read undef SCC
6292 MI.eraseFromParent();
6293 return BB;
6294 }
6295 case AMDGPU::ADJCALLSTACKUP:
6296 case AMDGPU::ADJCALLSTACKDOWN: {
6297 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
6298 MachineInstrBuilder MIB(*MF, &MI);
6299 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
6300 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
6301 return BB;
6302 }
6303 case AMDGPU::SI_CALL_ISEL: {
6304 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
6305
6306 MachineInstrBuilder MIB;
6307 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6308
6309 for (const MachineOperand &MO : MI.operands())
6310 MIB.add(MO);
6311
6312 MIB.cloneMemRefs(MI);
6313 MI.eraseFromParent();
6314 return BB;
6315 }
6316 case AMDGPU::V_ADD_CO_U32_e32:
6317 case AMDGPU::V_SUB_CO_U32_e32:
6318 case AMDGPU::V_SUBREV_CO_U32_e32: {
6319 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
6320 unsigned Opc = MI.getOpcode();
6321
6322 bool NeedClampOperand = false;
6323 if (TII->pseudoToMCOpcode(Opc) == -1) {
6324 Opc = AMDGPU::getVOPe64(Opc);
6325 NeedClampOperand = true;
6326 }
6327
6328 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
6329 if (TII->isVOP3(*I)) {
6330 I.addReg(TRI->getVCC(), RegState::Define);
6331 }
6332 I.add(MI.getOperand(1)).add(MI.getOperand(2));
6333 if (NeedClampOperand)
6334 I.addImm(0); // clamp bit for e64 encoding
6335
6336 TII->legalizeOperands(*I);
6337
6338 MI.eraseFromParent();
6339 return BB;
6340 }
6341 case AMDGPU::V_ADDC_U32_e32:
6342 case AMDGPU::V_SUBB_U32_e32:
6343 case AMDGPU::V_SUBBREV_U32_e32:
6344 // These instructions have an implicit use of vcc which counts towards the
6345 // constant bus limit.
6346 TII->legalizeOperands(MI);
6347 return BB;
6348 case AMDGPU::DS_GWS_INIT:
6349 case AMDGPU::DS_GWS_SEMA_BR:
6350 case AMDGPU::DS_GWS_BARRIER:
6351 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
6352 [[fallthrough]];
6353 case AMDGPU::DS_GWS_SEMA_V:
6354 case AMDGPU::DS_GWS_SEMA_P:
6355 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6356 // An s_waitcnt 0 is required to be the instruction immediately following.
6357 if (getSubtarget()->hasGWSAutoReplay()) {
6358 bundleInstWithWaitcnt(MI);
6359 return BB;
6360 }
6361
6362 return emitGWSMemViolTestLoop(MI, BB);
6363 case AMDGPU::S_SETREG_B32: {
6364 // Try to optimize cases that only set the denormal mode or rounding mode.
6365 //
6366 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
6367 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
6368 // instead.
6369 //
6370 // FIXME: This could be predicated on the immediate, but tablegen doesn't
6371 // allow you to have a no-side-effect instruction in the output of a
6372 // side-effecting pattern.
6373 auto [ID, Offset, Width] =
6374 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
6375 if (ID != AMDGPU::Hwreg::ID_MODE)
6376 return BB;
6377
6378 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
6379 const unsigned SetMask = WidthMask << Offset;
6380
6381 if (getSubtarget()->hasDenormModeInst()) {
6382 unsigned SetDenormOp = 0;
6383 unsigned SetRoundOp = 0;
6384
6385 // The dedicated instructions can only set the whole denorm or round mode
6386 // at once, not a subset of bits in either.
6387 if (SetMask ==
6388 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
6389 // If this fully sets both the round and denorm mode, emit the two
6390 // dedicated instructions for these.
6391 SetRoundOp = AMDGPU::S_ROUND_MODE;
6392 SetDenormOp = AMDGPU::S_DENORM_MODE;
6393 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
6394 SetRoundOp = AMDGPU::S_ROUND_MODE;
6395 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
6396 SetDenormOp = AMDGPU::S_DENORM_MODE;
6397 }
6398
6399 if (SetRoundOp || SetDenormOp) {
6400 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
6401 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6402 unsigned ImmVal = Def->getOperand(1).getImm();
6403 if (SetRoundOp) {
6404 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
6405 .addImm(ImmVal & 0xf);
6406
6407 // If we also have the denorm mode, get just the denorm mode bits.
6408 ImmVal >>= 4;
6409 }
6410
6411 if (SetDenormOp) {
6412 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
6413 .addImm(ImmVal & 0xf);
6414 }
6415
6416 MI.eraseFromParent();
6417 return BB;
6418 }
6419 }
6420 }
6421
6422 // If only FP bits are touched, use the no-side-effects pseudo.
6423 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
6424 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
6425 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
6426
6427 return BB;
6428 }
6429 case AMDGPU::S_INVERSE_BALLOT_U32:
6430 case AMDGPU::S_INVERSE_BALLOT_U64:
6431 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
6432 // necessary. After that they are equivalent to a COPY.
6433 MI.setDesc(TII->get(AMDGPU::COPY));
6434 return BB;
6435 case AMDGPU::ENDPGM_TRAP: {
6436 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
6437 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
6438 MI.addOperand(MachineOperand::CreateImm(0));
6439 return BB;
6440 }
6441
6442 // We need a block split to make the real endpgm a terminator. We also don't
6443 // want to break phis in successor blocks, so we can't just delete to the
6444 // end of the block.
6445
6446 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
6447 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6448 MF->push_back(TrapBB);
6449 // clang-format off
6450 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
6451 .addImm(0);
6452 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
6453 .addMBB(TrapBB);
6454 // clang-format on
6455
6456 BB->addSuccessor(TrapBB);
6457 MI.eraseFromParent();
6458 return SplitBB;
6459 }
6460 case AMDGPU::SIMULATED_TRAP: {
6461 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6462 MachineBasicBlock *SplitBB =
6463 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
6464 MI.eraseFromParent();
6465 return SplitBB;
6466 }
6467 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6468 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6470
6471 // During ISel, it's difficult to propagate the original EXEC mask to use as
6472 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
6473 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
6474 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6475 Register OriginalExec = Setup->getOperand(0).getReg();
6476 MF->getRegInfo().clearKillFlags(OriginalExec);
6477 MI.getOperand(0).setReg(OriginalExec);
6478 return BB;
6479 }
6480 default:
6481 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
6482 if (!MI.mayStore())
6483 AddMemOpInit(MI);
6484 return BB;
6485 }
6487 }
6488}
6489
6490 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
6491 // This currently forces unfolding various combinations of fsub into fma with
6492 // free fneg'd operands. As long as we have fast FMA (controlled by
6493 // isFMAFasterThanFMulAndFAdd), we should perform these.
6494
6495 // When fma is quarter rate, for f64 where add / sub are at best half rate,
6496 // most of these combines appear to be cycle neutral but save on instruction
6497 // count / code size.
6498 return true;
6499}
6500
6502
6504 EVT VT) const {
6505 if (!VT.isVector()) {
6506 return MVT::i1;
6507 }
6508 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
6509}
6510
6512 // TODO: Should i16 be used always if legal? For now it would force VALU
6513 // shifts.
6514 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6515}
6516
6518 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6519 ? Ty.changeElementSize(16)
6520 : Ty.changeElementSize(32);
6521}
6522
6523 // Answering this is somewhat tricky and depends on the specific device, as
6524 // different devices have different rates for fma and for f64 operations.
6525//
6526// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
6527// regardless of which device (although the number of cycles differs between
6528// devices), so it is always profitable for f64.
6529//
6530// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
6531// only on full rate devices. Normally, we should prefer selecting v_mad_f32
6532// which we can always do even without fused FP ops since it returns the same
6533// result as the separate operations and since it is always full
6534// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
6535// however does not support denormals, so we do report fma as faster if we have
6536// a fast fma device and require denormals.
6537//
6538 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6539 EVT VT) const {
6540 VT = VT.getScalarType();
6541
6542 switch (VT.getSimpleVT().SimpleTy) {
6543 case MVT::f32: {
6544 // If mad is not available this depends only on if f32 fma is full rate.
6545 if (!Subtarget->hasMadMacF32Insts())
6546 return Subtarget->hasFastFMAF32();
6547
6548 // Otherwise f32 mad is always full rate and returns the same result as
6549 // the separate operations, so it should be preferred over fma.
6550 // However, it does not support denormals.
6552 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6553
6554 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
6555 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6556 }
6557 case MVT::f64:
6558 return true;
6559 case MVT::f16:
6560 case MVT::bf16:
6561 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
6562 default:
6563 break;
6564 }
6565
6566 return false;
6567}
6568
6569 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
6570 LLT Ty) const {
6571 switch (Ty.getScalarSizeInBits()) {
6572 case 16:
6573 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
6574 case 32:
6575 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
6576 case 64:
6577 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
6578 default:
6579 break;
6580 }
6581
6582 return false;
6583}
6584
6586 if (!Ty.isScalar())
6587 return false;
6588
6589 if (Ty.getScalarSizeInBits() == 16)
6590 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
6591 if (Ty.getScalarSizeInBits() == 32)
6592 return Subtarget->hasMadMacF32Insts() &&
6593 denormalModeIsFlushAllF32(*MI.getMF());
6594
6595 return false;
6596}
6597
6598 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
6599 const SDNode *N) const {
6600 // TODO: Check future ftz flag
6601 // v_mad_f32/v_mac_f32 do not support denormals.
6602 EVT VT = N->getValueType(0);
6603 if (VT == MVT::f32)
6604 return Subtarget->hasMadMacF32Insts() &&
6606 if (VT == MVT::f16) {
6607 return Subtarget->hasMadF16() &&
6608           denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
6609  }
6610
6611 return false;
6612}
6613
6614//===----------------------------------------------------------------------===//
6615// Custom DAG Lowering Operations
6616//===----------------------------------------------------------------------===//
6617
6618// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6619// wider vector type is legal.
6620SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
6621                                             SelectionDAG &DAG) const {
6622 unsigned Opc = Op.getOpcode();
6623 EVT VT = Op.getValueType();
6624 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
6625 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
6626 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6627 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
6628
6629 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
6630
6631 SDLoc SL(Op);
6632 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
6633 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
6634
6635 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6636}
6637
6638// Enable lowering of ROTR for vxi32 types. This is a workaround for a
6639// regression whereby extra unnecessary instructions were added to the code
6640// generated for rotr operations, caused by legalizing v2i32 or. This resulted
6641// in extra instructions to extract the result from the vector.
6642SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
6643  [[maybe_unused]] EVT VT = Op.getValueType();
6644
6645 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6646 VT == MVT::v16i32) &&
6647 "Unexpected ValueType.");
6648
6649 return DAG.UnrollVectorOp(Op.getNode());
6650}
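// For example, a rotr on v2i32 is unrolled here into two scalar i32 rotr
// nodes whose results are rebuilt into a vector, instead of being scalarized
// late by the generic legalizer (the source of the extra extract instructions
// mentioned above).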
6651
6652// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
6653// wider vector type is legal.
6654SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
6655                                              SelectionDAG &DAG) const {
6656 unsigned Opc = Op.getOpcode();
6657 EVT VT = Op.getValueType();
6658 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6659 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6660 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6661 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6662 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6663 VT == MVT::v32bf16);
6664
6665 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
6666 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6667
6668 SDLoc SL(Op);
6669
6670 SDValue OpLo =
6671 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
6672 SDValue OpHi =
6673 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
6674
6675 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6676}
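// As an illustrative example: an fadd on v4f16 is handled here by splitting
// both operands, performing two v2f16 fadds on the low and high halves, and
// recombining the results with a CONCAT_VECTORS node.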
6677
6678SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
6679                                               SelectionDAG &DAG) const {
6680 unsigned Opc = Op.getOpcode();
6681 EVT VT = Op.getValueType();
6682 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6683 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6684 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6685 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6686 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6687 VT == MVT::v32bf16);
6688
6689 SDValue Op0 = Op.getOperand(0);
6690 auto [Lo0, Hi0] = Op0.getValueType().isVector()
6691 ? DAG.SplitVectorOperand(Op.getNode(), 0)
6692 : std::pair(Op0, Op0);
6693
6694 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
6695 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
6696
6697 SDLoc SL(Op);
6698 auto ResVT = DAG.GetSplitDestVTs(VT);
6699
6700 SDValue OpLo =
6701 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
6702 SDValue OpHi =
6703 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
6704
6705 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
6706}
6707
6708SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
6709  switch (Op.getOpcode()) {
6710  default:
6711    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
6712  case ISD::BRCOND:
6713 return LowerBRCOND(Op, DAG);
6714 case ISD::RETURNADDR:
6715 return LowerRETURNADDR(Op, DAG);
6716 case ISD::LOAD: {
6717 SDValue Result = LowerLOAD(Op, DAG);
6718 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
6719 "Load should return a value and a chain");
6720 return Result;
6721 }
6722 case ISD::FSQRT: {
6723 EVT VT = Op.getValueType();
6724 if (VT == MVT::f32)
6725 return lowerFSQRTF32(Op, DAG);
6726 if (VT == MVT::f64)
6727 return lowerFSQRTF64(Op, DAG);
6728 return SDValue();
6729 }
6730 case ISD::FSIN:
6731 case ISD::FCOS:
6732 return LowerTrig(Op, DAG);
6733 case ISD::SELECT:
6734 return LowerSELECT(Op, DAG);
6735 case ISD::FDIV:
6736 return LowerFDIV(Op, DAG);
6737 case ISD::FFREXP:
6738 return LowerFFREXP(Op, DAG);
6739 case ISD::ATOMIC_CMP_SWAP:
6740 return LowerATOMIC_CMP_SWAP(Op, DAG);
6741 case ISD::STORE:
6742 return LowerSTORE(Op, DAG);
6743  case ISD::GlobalAddress: {
6744    MachineFunction &MF = DAG.getMachineFunction();
6745    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6746    return LowerGlobalAddress(MFI, Op, DAG);
6747 }
6748  case ISD::INTRINSIC_WO_CHAIN:
6749    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6750  case ISD::INTRINSIC_W_CHAIN:
6751    return LowerINTRINSIC_W_CHAIN(Op, DAG);
6752  case ISD::INTRINSIC_VOID:
6753    return LowerINTRINSIC_VOID(Op, DAG);
6754 case ISD::ADDRSPACECAST:
6755 return lowerADDRSPACECAST(Op, DAG);
6756  case ISD::INSERT_SUBVECTOR:
6757    return lowerINSERT_SUBVECTOR(Op, DAG);
6758  case ISD::INSERT_VECTOR_ELT:
6759    return lowerINSERT_VECTOR_ELT(Op, DAG);
6760  case ISD::EXTRACT_VECTOR_ELT:
6761    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
6762  case ISD::VECTOR_SHUFFLE:
6763    return lowerVECTOR_SHUFFLE(Op, DAG);
6764  case ISD::SCALAR_TO_VECTOR:
6765    return lowerSCALAR_TO_VECTOR(Op, DAG);
6766 case ISD::BUILD_VECTOR:
6767 return lowerBUILD_VECTOR(Op, DAG);
6768  case ISD::FP_ROUND:
6769  case ISD::STRICT_FP_ROUND:
6770    return lowerFP_ROUND(Op, DAG);
6771 case ISD::TRAP:
6772 return lowerTRAP(Op, DAG);
6773 case ISD::DEBUGTRAP:
6774 return lowerDEBUGTRAP(Op, DAG);
6775 case ISD::ABS:
6776 case ISD::FABS:
6777 case ISD::FNEG:
6778 case ISD::FCANONICALIZE:
6779 case ISD::BSWAP:
6780 return splitUnaryVectorOp(Op, DAG);
6781 case ISD::FMINNUM:
6782 case ISD::FMAXNUM:
6783 return lowerFMINNUM_FMAXNUM(Op, DAG);
6784 case ISD::FMINIMUMNUM:
6785 case ISD::FMAXIMUMNUM:
6786 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
6787 case ISD::FMINIMUM:
6788 case ISD::FMAXIMUM:
6789 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
6790 case ISD::FLDEXP:
6791 case ISD::STRICT_FLDEXP:
6792 return lowerFLDEXP(Op, DAG);
6793 case ISD::FMA:
6794 return splitTernaryVectorOp(Op, DAG);
6795 case ISD::FP_TO_SINT:
6796 case ISD::FP_TO_UINT:
6797 return LowerFP_TO_INT(Op, DAG);
6798 case ISD::SHL:
6799 case ISD::SRA:
6800 case ISD::SRL:
6801 case ISD::ADD:
6802 case ISD::SUB:
6803 case ISD::SMIN:
6804 case ISD::SMAX:
6805 case ISD::UMIN:
6806 case ISD::UMAX:
6807 case ISD::FADD:
6808 case ISD::FMUL:
6809 case ISD::FMINNUM_IEEE:
6810 case ISD::FMAXNUM_IEEE:
6811 case ISD::UADDSAT:
6812 case ISD::USUBSAT:
6813 case ISD::SADDSAT:
6814 case ISD::SSUBSAT:
6815 return splitBinaryVectorOp(Op, DAG);
6816 case ISD::FCOPYSIGN:
6817 return lowerFCOPYSIGN(Op, DAG);
6818 case ISD::MUL:
6819 return lowerMUL(Op, DAG);
6820 case ISD::SMULO:
6821 case ISD::UMULO:
6822 return lowerXMULO(Op, DAG);
6823 case ISD::SMUL_LOHI:
6824 case ISD::UMUL_LOHI:
6825 return lowerXMUL_LOHI(Op, DAG);
6826 case ISD::DYNAMIC_STACKALLOC:
6827 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6828 case ISD::STACKSAVE:
6829 return LowerSTACKSAVE(Op, DAG);
6830 case ISD::GET_ROUNDING:
6831 return lowerGET_ROUNDING(Op, DAG);
6832 case ISD::SET_ROUNDING:
6833 return lowerSET_ROUNDING(Op, DAG);
6834 case ISD::PREFETCH:
6835 return lowerPREFETCH(Op, DAG);
6836  case ISD::FP_EXTEND:
6837  case ISD::STRICT_FP_EXTEND:
6838    return lowerFP_EXTEND(Op, DAG);
6839 case ISD::GET_FPENV:
6840 return lowerGET_FPENV(Op, DAG);
6841 case ISD::SET_FPENV:
6842 return lowerSET_FPENV(Op, DAG);
6843 case ISD::ROTR:
6844 return lowerROTR(Op, DAG);
6845 }
6846 return SDValue();
6847}
6848
6849// Used for D16: Casts the result of an instruction into the right vector,
6850// packs values if loads return unpacked values.
6852 const SDLoc &DL, SelectionDAG &DAG,
6853 bool Unpacked) {
6854 if (!LoadVT.isVector())
6855 return Result;
6856
6857 // Cast back to the original packed type or to a larger type that is a
6858  // multiple of 32 bits for D16. Widening the return type is required for
6859  // legalization.
6860 EVT FittingLoadVT = LoadVT;
6861 if ((LoadVT.getVectorNumElements() % 2) == 1) {
6862    FittingLoadVT =
6863        EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6864                         LoadVT.getVectorNumElements() + 1);
6865 }
6866
6867 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
6868 // Truncate to v2i16/v4i16.
6869 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
6870
6871 // Workaround legalizer not scalarizing truncate after vector op
6872 // legalization but not creating intermediate vector trunc.
6874 DAG.ExtractVectorElements(Result, Elts);
6875 for (SDValue &Elt : Elts)
6876 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6877
6878    // Pad illegal v1i16/v3f16 to v4i16.
6879 if ((LoadVT.getVectorNumElements() % 2) == 1)
6880 Elts.push_back(DAG.getPOISON(MVT::i16));
6881
6882 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
6883
6884 // Bitcast to original type (v2f16/v4f16).
6885 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6886 }
6887
6888 // Cast back to the original packed type.
6889 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
6890}
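// For example, on a subtarget with unpacked D16 memory instructions a v3f16
// load result comes back as v3i32; the elements are truncated to i16, padded
// with a poison i16 element to v4i16, and bitcast to the widened v4f16 type
// that legalization expects.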
6891
6892SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
6893 SelectionDAG &DAG,
6895 bool IsIntrinsic) const {
6896 SDLoc DL(M);
6897
6898 bool Unpacked = Subtarget->hasUnpackedD16VMem();
6899 EVT LoadVT = M->getValueType(0);
6900
6901 EVT EquivLoadVT = LoadVT;
6902 if (LoadVT.isVector()) {
6903 if (Unpacked) {
6904 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6905 LoadVT.getVectorNumElements());
6906 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
6907 // Widen v3f16 to legal type
6908      EquivLoadVT =
6909          EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
6910                           LoadVT.getVectorNumElements() + 1);
6911 }
6912 }
6913
6914 // Change from v4f16/v2f16 to EquivLoadVT.
6915 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
6916
6917  SDValue Load = DAG.getMemIntrinsicNode(
6918      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
6919 M->getMemoryVT(), M->getMemOperand());
6920
6921 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
6922
6923 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
6924}
6925
6926SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
6927 SelectionDAG &DAG,
6928 ArrayRef<SDValue> Ops) const {
6929 SDLoc DL(M);
6930 EVT LoadVT = M->getValueType(0);
6931 EVT EltType = LoadVT.getScalarType();
6932 EVT IntVT = LoadVT.changeTypeToInteger();
6933
6934 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
6935
6936 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
6937 bool IsTFE = M->getNumValues() == 3;
6938
6939  unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
6940                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
6941                 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
6942 : AMDGPUISD::BUFFER_LOAD;
6943
6944 if (IsD16) {
6945 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
6946 }
6947
6948 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6949 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
6950 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
6951 IsTFE);
6952
6953 if (isTypeLegal(LoadVT)) {
6954 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
6955 M->getMemOperand(), DAG);
6956 }
6957
6958 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
6959 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
6960 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
6961 M->getMemOperand(), DAG);
6962 return DAG.getMergeValues(
6963 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6964 DL);
6965}
6966
6968 SelectionDAG &DAG) {
6969 EVT VT = N->getValueType(0);
6970 unsigned CondCode = N->getConstantOperandVal(3);
6971 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6972 return DAG.getPOISON(VT);
6973
6974 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6975
6976 SDValue LHS = N->getOperand(1);
6977 SDValue RHS = N->getOperand(2);
6978
6979 SDLoc DL(N);
6980
6981 EVT CmpVT = LHS.getValueType();
6982 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6983 unsigned PromoteOp =
6984        ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6985    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6986 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6987 }
6988
6989 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6990
6991 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6992 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6993
6994 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6995 DAG.getCondCode(CCOpcode));
6996 if (VT.bitsEq(CCVT))
6997 return SetCC;
6998 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6999}
7000
7002 SelectionDAG &DAG) {
7003 EVT VT = N->getValueType(0);
7004
7005 unsigned CondCode = N->getConstantOperandVal(3);
7006 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7007 return DAG.getPOISON(VT);
7008
7009 SDValue Src0 = N->getOperand(1);
7010 SDValue Src1 = N->getOperand(2);
7011 EVT CmpVT = Src0.getValueType();
7012 SDLoc SL(N);
7013
7014 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7015 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7016 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7017 }
7018
7019 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7020 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7021 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7022 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7023 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7024 DAG.getCondCode(CCOpcode));
7025 if (VT.bitsEq(CCVT))
7026 return SetCC;
7027 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7028}
7029
7031 SelectionDAG &DAG) {
7032 EVT VT = N->getValueType(0);
7033 SDValue Src = N->getOperand(1);
7034 SDLoc SL(N);
7035
7036 if (Src.getOpcode() == ISD::SETCC) {
7037 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7038 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
7039 Src.getOperand(1), Src.getOperand(2));
7040 }
7041 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7042 // (ballot 0) -> 0
7043 if (Arg->isZero())
7044 return DAG.getConstant(0, SL, VT);
7045
7046 // (ballot 1) -> EXEC/EXEC_LO
7047 if (Arg->isOne()) {
7048 Register Exec;
7049 if (VT.getScalarSizeInBits() == 32)
7050 Exec = AMDGPU::EXEC_LO;
7051 else if (VT.getScalarSizeInBits() == 64)
7052 Exec = AMDGPU::EXEC;
7053 else
7054 return SDValue();
7055
7056 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7057 }
7058 }
7059
7060 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7061 // ISD::SETNE)
7062 return DAG.getNode(
7063 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7064 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7065}
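// For example, on a wave64 subtarget (ballot (setcc ...)) becomes an
// AMDGPUISD::SETCC node producing an i64 lane mask, while (ballot 1) is
// lowered to a copy of EXEC (or EXEC_LO for a 32-bit result type).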
7066
7068 SelectionDAG &DAG) {
7069 EVT VT = N->getValueType(0);
7070 unsigned ValSize = VT.getSizeInBits();
7071 unsigned IID = N->getConstantOperandVal(0);
7072 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7073 IID == Intrinsic::amdgcn_permlanex16;
7074 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7075 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7076 SDLoc SL(N);
7077 MVT IntVT = MVT::getIntegerVT(ValSize);
7078 const GCNSubtarget *ST = TLI.getSubtarget();
7079 unsigned SplitSize = 32;
7080 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7081 ST->hasDPALU_DPP() &&
7082 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7083 SplitSize = 64;
7084
7085 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7086 SDValue Src2, MVT ValT) -> SDValue {
7088 switch (IID) {
7089 case Intrinsic::amdgcn_permlane16:
7090 case Intrinsic::amdgcn_permlanex16:
7091 case Intrinsic::amdgcn_update_dpp:
7092 Operands.push_back(N->getOperand(6));
7093 Operands.push_back(N->getOperand(5));
7094 Operands.push_back(N->getOperand(4));
7095 [[fallthrough]];
7096 case Intrinsic::amdgcn_writelane:
7097 Operands.push_back(Src2);
7098 [[fallthrough]];
7099 case Intrinsic::amdgcn_readlane:
7100 case Intrinsic::amdgcn_set_inactive:
7101 case Intrinsic::amdgcn_set_inactive_chain_arg:
7102 case Intrinsic::amdgcn_mov_dpp8:
7103 Operands.push_back(Src1);
7104 [[fallthrough]];
7105 case Intrinsic::amdgcn_readfirstlane:
7106 case Intrinsic::amdgcn_permlane64:
7107 Operands.push_back(Src0);
7108 break;
7109 default:
7110 llvm_unreachable("unhandled lane op");
7111 }
7112
7113 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7114 std::reverse(Operands.begin(), Operands.end());
7115
7116 if (SDNode *GL = N->getGluedNode()) {
7117 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7118 GL = GL->getOperand(0).getNode();
7119 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7120 SDValue(GL, 0)));
7121 }
7122
7123 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7124 };
7125
7126 SDValue Src0 = N->getOperand(1);
7127 SDValue Src1, Src2;
7128 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7129 IID == Intrinsic::amdgcn_mov_dpp8 ||
7130 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7131 Src1 = N->getOperand(2);
7132 if (IID == Intrinsic::amdgcn_writelane ||
7133 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7134 Src2 = N->getOperand(3);
7135 }
7136
7137 if (ValSize == SplitSize) {
7138 // Already legal
7139 return SDValue();
7140 }
7141
7142 if (ValSize < 32) {
7143 bool IsFloat = VT.isFloatingPoint();
7144 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7145 SL, MVT::i32);
7146
7147 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7148 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7149 SL, MVT::i32);
7150 }
7151
7152 if (IID == Intrinsic::amdgcn_writelane) {
7153 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7154 SL, MVT::i32);
7155 }
7156
7157 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7158 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7159 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7160 }
7161
7162 if (ValSize % SplitSize != 0)
7163 return SDValue();
7164
7165 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7166 EVT VT = N->getValueType(0);
7167 unsigned NE = VT.getVectorNumElements();
7168 EVT EltVT = VT.getVectorElementType();
7170 unsigned NumOperands = N->getNumOperands();
7171 SmallVector<SDValue, 4> Operands(NumOperands);
7172 SDNode *GL = N->getGluedNode();
7173
7174 // only handle convergencectrl_glue
7175 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7176
7177 for (unsigned i = 0; i != NE; ++i) {
7178 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7179 ++j) {
7180 SDValue Operand = N->getOperand(j);
7181 EVT OperandVT = Operand.getValueType();
7182 if (OperandVT.isVector()) {
7183 // A vector operand; extract a single element.
7184 EVT OperandEltVT = OperandVT.getVectorElementType();
7185 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7186 Operand, DAG.getVectorIdxConstant(i, SL));
7187 } else {
7188 // A scalar operand; just use it as is.
7189 Operands[j] = Operand;
7190 }
7191 }
7192
7193 if (GL)
7194 Operands[NumOperands - 1] =
7195 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7196 SDValue(GL->getOperand(0).getNode(), 0));
7197
7198 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7199 }
7200
7201 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7202 return DAG.getBuildVector(VecVT, SL, Scalars);
7203 };
7204
7205 if (VT.isVector()) {
7206 switch (MVT::SimpleValueType EltTy =
7208 case MVT::i32:
7209 case MVT::f32:
7210 if (SplitSize == 32) {
7211 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
7212 return unrollLaneOp(LaneOp.getNode());
7213 }
7214 [[fallthrough]];
7215 case MVT::i16:
7216 case MVT::f16:
7217 case MVT::bf16: {
7218 unsigned SubVecNumElt =
7219 SplitSize / VT.getVectorElementType().getSizeInBits();
7220 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
7222 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7223 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7224 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
7225 DAG.getConstant(EltIdx, SL, MVT::i32));
7226
7227 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7228 IsPermLane16)
7229 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
7230 DAG.getConstant(EltIdx, SL, MVT::i32));
7231
7232 if (IID == Intrinsic::amdgcn_writelane)
7233 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
7234 DAG.getConstant(EltIdx, SL, MVT::i32));
7235
7236 Pieces.push_back(
7237 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7238 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7239 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7240 EltIdx += SubVecNumElt;
7241 }
7242 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
7243 }
7244 default:
7245 // Handle all other cases by bitcasting to i32 vectors
7246 break;
7247 }
7248 }
7249
7250 MVT VecVT =
7251 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
7252 Src0 = DAG.getBitcast(VecVT, Src0);
7253
7254 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7255 Src1 = DAG.getBitcast(VecVT, Src1);
7256
7257 if (IID == Intrinsic::amdgcn_writelane)
7258 Src2 = DAG.getBitcast(VecVT, Src2);
7259
7260 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7261 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
7262 return DAG.getBitcast(VT, UnrolledLaneOp);
7263}
7264
7265void SITargetLowering::ReplaceNodeResults(SDNode *N,
7266                                          SmallVectorImpl<SDValue> &Results,
7267                                          SelectionDAG &DAG) const {
7268 switch (N->getOpcode()) {
7269  case ISD::INSERT_VECTOR_ELT: {
7270    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
7271 Results.push_back(Res);
7272 return;
7273 }
7274  case ISD::EXTRACT_VECTOR_ELT: {
7275    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
7276 Results.push_back(Res);
7277 return;
7278 }
7279  case ISD::INTRINSIC_WO_CHAIN: {
7280    unsigned IID = N->getConstantOperandVal(0);
7281 switch (IID) {
7282 case Intrinsic::amdgcn_make_buffer_rsrc:
7283 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
7284 return;
7285 case Intrinsic::amdgcn_cvt_pkrtz: {
7286 SDValue Src0 = N->getOperand(1);
7287 SDValue Src1 = N->getOperand(2);
7288 SDLoc SL(N);
7289 SDValue Cvt =
7290 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7291 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
7292 return;
7293 }
7294 case Intrinsic::amdgcn_cvt_pknorm_i16:
7295 case Intrinsic::amdgcn_cvt_pknorm_u16:
7296 case Intrinsic::amdgcn_cvt_pk_i16:
7297 case Intrinsic::amdgcn_cvt_pk_u16: {
7298 SDValue Src0 = N->getOperand(1);
7299 SDValue Src1 = N->getOperand(2);
7300 SDLoc SL(N);
7301 unsigned Opcode;
7302
7303      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7304        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7305      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7306        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7307      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7308        Opcode = AMDGPUISD::CVT_PK_I16_I32;
7309      else
7310        Opcode = AMDGPUISD::CVT_PK_U16_U32;
7311
7312 EVT VT = N->getValueType(0);
7313 if (isTypeLegal(VT))
7314 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
7315 else {
7316 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
7317 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
7318 }
7319 return;
7320 }
7321 case Intrinsic::amdgcn_s_buffer_load: {
7322 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
7323 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
7324 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
7325 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
7326 // s_buffer_load_i8.
7327 if (!Subtarget->hasScalarSubwordLoads())
7328 return;
7329 SDValue Op = SDValue(N, 0);
7330 SDValue Rsrc = Op.getOperand(1);
7331 SDValue Offset = Op.getOperand(2);
7332 SDValue CachePolicy = Op.getOperand(3);
7333 EVT VT = Op.getValueType();
7334 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
7335 SDLoc DL(Op);
7337 const DataLayout &DataLayout = DAG.getDataLayout();
7338 Align Alignment =
7344 VT.getStoreSize(), Alignment);
7345 SDValue LoadVal;
7346 if (!Offset->isDivergent()) {
7347 SDValue Ops[] = {Rsrc, // source register
7348 Offset, CachePolicy};
7349 SDValue BufferLoad =
7351 DAG.getVTList(MVT::i32), Ops, VT, MMO);
7352 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
7353 } else {
7354 SDValue Ops[] = {
7355 DAG.getEntryNode(), // Chain
7356 Rsrc, // rsrc
7357 DAG.getConstant(0, DL, MVT::i32), // vindex
7358 {}, // voffset
7359 {}, // soffset
7360 {}, // offset
7361 CachePolicy, // cachepolicy
7362 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7363 };
7364 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
7365 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
7366 }
7367 Results.push_back(LoadVal);
7368 return;
7369 }
7370 case Intrinsic::amdgcn_dead: {
7371 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
7372 Results.push_back(DAG.getPOISON(N->getValueType(I)));
7373 return;
7374 }
7375 }
7376 break;
7377 }
7378  case ISD::INTRINSIC_W_CHAIN: {
7379    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
7380 if (Res.getOpcode() == ISD::MERGE_VALUES) {
7381 // FIXME: Hacky
7382 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
7383 Results.push_back(Res.getOperand(I));
7384 }
7385 } else {
7386 Results.push_back(Res);
7387 Results.push_back(Res.getValue(1));
7388 }
7389 return;
7390 }
7391
7392 break;
7393 }
7394 case ISD::SELECT: {
7395 SDLoc SL(N);
7396 EVT VT = N->getValueType(0);
7397 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
7398 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
7399 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
7400
7401 EVT SelectVT = NewVT;
7402 if (NewVT.bitsLT(MVT::i32)) {
7403 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
7404 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
7405 SelectVT = MVT::i32;
7406 }
7407
7408 SDValue NewSelect =
7409 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
7410
7411 if (NewVT != SelectVT)
7412 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
7413 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
7414 return;
7415 }
7416 case ISD::FNEG: {
7417 if (N->getValueType(0) != MVT::v2f16)
7418 break;
7419
7420 SDLoc SL(N);
7421 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7422
7423 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
7424 DAG.getConstant(0x80008000, SL, MVT::i32));
7425 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7426 return;
7427 }
7428 case ISD::FABS: {
7429 if (N->getValueType(0) != MVT::v2f16)
7430 break;
7431
7432 SDLoc SL(N);
7433 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
7434
7435 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
7436 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
7437 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
7438 return;
7439 }
7440 case ISD::FSQRT: {
7441 if (N->getValueType(0) != MVT::f16)
7442 break;
7443 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
7444 break;
7445 }
7446  default:
7447    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
7448    break;
7449 }
7450}
7451
7452/// Helper function for LowerBRCOND
7453static SDNode *findUser(SDValue Value, unsigned Opcode) {
7454
7455 for (SDUse &U : Value->uses()) {
7456 if (U.get() != Value)
7457 continue;
7458
7459 if (U.getUser()->getOpcode() == Opcode)
7460 return U.getUser();
7461 }
7462 return nullptr;
7463}
7464
7465unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
7466 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
7467 switch (Intr->getConstantOperandVal(1)) {
7468 case Intrinsic::amdgcn_if:
7469 return AMDGPUISD::IF;
7470 case Intrinsic::amdgcn_else:
7471 return AMDGPUISD::ELSE;
7472 case Intrinsic::amdgcn_loop:
7473 return AMDGPUISD::LOOP;
7474 case Intrinsic::amdgcn_end_cf:
7475 llvm_unreachable("should not occur");
7476 default:
7477 return 0;
7478 }
7479 }
7480
7481 // break, if_break, else_break are all only used as inputs to loop, not
7482 // directly as branch conditions.
7483 return 0;
7484}
7485
7492
7494 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7495 return false;
7496
7497 // FIXME: Either avoid relying on address space here or change the default
7498 // address space for functions to avoid the explicit check.
7499 return (GV->getValueType()->isFunctionTy() ||
7502}
7503
7505 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
7506}
7507
7509 if (!GV->hasExternalLinkage())
7510 return true;
7511
7512 const auto OS = getTargetMachine().getTargetTriple().getOS();
7513 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
7514}
7515
7516/// This transforms the control flow intrinsics to get the branch destination as
7517/// the last parameter, and switches the branch target with BR if needed.
7518SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
7519 SDLoc DL(BRCOND);
7520
7521 SDNode *Intr = BRCOND.getOperand(1).getNode();
7522 SDValue Target = BRCOND.getOperand(2);
7523 SDNode *BR = nullptr;
7524 SDNode *SetCC = nullptr;
7525
7526 switch (Intr->getOpcode()) {
7527 case ISD::SETCC: {
7528 // As long as we negate the condition everything is fine
7529 SetCC = Intr;
7530 Intr = SetCC->getOperand(0).getNode();
7531 break;
7532 }
7533 case ISD::XOR: {
7534 // Similar to SETCC, if we have (xor c, -1), we will be fine.
7535 SDValue LHS = Intr->getOperand(0);
7536 SDValue RHS = Intr->getOperand(1);
7537 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
7538 Intr = LHS.getNode();
7539 break;
7540 }
7541 [[fallthrough]];
7542 }
7543 default: {
7544 // Get the target from BR if we don't negate the condition
7545 BR = findUser(BRCOND, ISD::BR);
7546 assert(BR && "brcond missing unconditional branch user");
7547 Target = BR->getOperand(1);
7548 }
7549 }
7550
7551 unsigned CFNode = isCFIntrinsic(Intr);
7552 if (CFNode == 0) {
7553 // This is a uniform branch so we don't need to legalize.
7554 return BRCOND;
7555 }
7556
7557 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
7559
7560 assert(!SetCC ||
7561 (SetCC->getConstantOperandVal(1) == 1 &&
7562 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
7563 ISD::SETNE));
7564
7565 // operands of the new intrinsic call
7567 if (HaveChain)
7568 Ops.push_back(BRCOND.getOperand(0));
7569
7570 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
7571 Ops.push_back(Target);
7572
7573 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
7574
7575 // build the new intrinsic call
7576 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
7577
7578 if (!HaveChain) {
7579 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
7580
7581    return DAG.getMergeValues(Ops, DL);
7582  }
7583
7584 if (BR) {
7585 // Give the branch instruction our target
7586 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
7587 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
7588 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
7589 }
7590
7591 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
7592
7593 // Copy the intrinsic results to registers
7594 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
7595 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
7596 if (!CopyToReg)
7597 continue;
7598
7599 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
7600 SDValue(Result, i - 1), SDValue());
7601
7602 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
7603 }
7604
7605 // Remove the old intrinsic from the chain
7606 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
7607 Intr->getOperand(0));
7608
7609 return Chain;
7610}
7611
7612SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
7613 MVT VT = Op.getSimpleValueType();
7614 SDLoc DL(Op);
7615 // Checking the depth
7616 if (Op.getConstantOperandVal(0) != 0)
7617 return DAG.getConstant(0, DL, VT);
7618
7619 MachineFunction &MF = DAG.getMachineFunction();
7620 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7621 // Check for kernel and shader functions
7622 if (Info->isEntryFunction())
7623 return DAG.getConstant(0, DL, VT);
7624
7625 MachineFrameInfo &MFI = MF.getFrameInfo();
7626 // There is a call to @llvm.returnaddress in this function
7627 MFI.setReturnAddressIsTaken(true);
7628
7629 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
7630 // Get the return address reg and mark it as an implicit live-in
7631 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
7632 getRegClassFor(VT, Op.getNode()->isDivergent()));
7633
7634 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
7635}
7636
7637SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
7638 const SDLoc &DL, EVT VT) const {
7639 return Op.getValueType().bitsLE(VT)
7640 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
7641 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
7642 DAG.getTargetConstant(0, DL, MVT::i32));
7643}
7644
7645SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
7646 SelectionDAG &DAG) const {
7647 EVT DstVT = Op.getValueType();
7648 unsigned NumElts = DstVT.getVectorNumElements();
7649 assert(NumElts > 2 && isPowerOf2_32(NumElts));
7650
7651 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7652
7653 SDLoc DL(Op);
7654 unsigned Opc = Op.getOpcode();
7655 SDValue Flags = Op.getOperand(1);
7656 EVT HalfDstVT =
7657 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
7658 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
7659 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
7660
7661 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
7662}
7663
7664SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
7665 SDValue Src = Op.getOperand(0);
7666 EVT SrcVT = Src.getValueType();
7667 EVT DstVT = Op.getValueType();
7668
7669 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
7670 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
7671 if (SrcVT.getScalarType() != MVT::f32)
7672 return SDValue();
7673 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
7674 }
7675
7676 if (SrcVT.getScalarType() != MVT::f64)
7677 return Op;
7678
7679 SDLoc DL(Op);
7680 if (DstVT == MVT::f16) {
7681 // TODO: Handle strictfp
7682 if (Op.getOpcode() != ISD::FP_ROUND)
7683 return Op;
7684
7685 if (!Subtarget->has16BitInsts()) {
7686 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
7687 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7688 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7689 }
7690 if (Op->getFlags().hasApproximateFuncs()) {
7691 SDValue Flags = Op.getOperand(1);
7692 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
7693 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
7694 }
7695 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
7696 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
7697 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
7698 }
7699
7700 assert(DstVT.getScalarType() == MVT::bf16 &&
7701 "custom lower FP_ROUND for f16 or bf16");
7702 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
7703
7704 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
7705 // hardware f32 -> bf16 instruction.
7706  EVT F32VT = SrcVT.isVector() ? SrcVT.changeVectorElementType(MVT::f32)
7707                               : MVT::f32;
7708 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
7709 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
7710 DAG.getTargetConstant(0, DL, MVT::i32));
7711}
7712
7713SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
7714 SelectionDAG &DAG) const {
7715 EVT VT = Op.getValueType();
7716 const MachineFunction &MF = DAG.getMachineFunction();
7717 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7718 bool IsIEEEMode = Info->getMode().IEEE;
7719
7720 // FIXME: Assert during selection that this is only selected for
7721 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
7722 // mode functions, but this happens to be OK since it's only done in cases
7723  // where it is known that there are no sNaNs.
7724 if (IsIEEEMode)
7725 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
7726
7727 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7728 VT == MVT::v16bf16)
7729 return splitBinaryVectorOp(Op, DAG);
7730 return Op;
7731}
7732
7733SDValue
7734SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
7735 SelectionDAG &DAG) const {
7736 EVT VT = Op.getValueType();
7737 const MachineFunction &MF = DAG.getMachineFunction();
7738 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
7739 bool IsIEEEMode = Info->getMode().IEEE;
7740
7741 if (IsIEEEMode)
7742 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
7743
7744 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
7745 VT == MVT::v16bf16)
7746 return splitBinaryVectorOp(Op, DAG);
7747 return Op;
7748}
7749
7750SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
7751 SelectionDAG &DAG) const {
7752 EVT VT = Op.getValueType();
7753 if (VT.isVector())
7754 return splitBinaryVectorOp(Op, DAG);
7755
7756 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
7757 !Subtarget->hasMinimum3Maximum3F16() &&
7758 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
7759 "should not need to widen f16 minimum/maximum to v2f16");
7760
7761 // Widen f16 operation to v2f16
7762
7763 // fminimum f16:x, f16:y ->
7764 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
7765 // (v2f16 (scalar_to_vector y))), 0
7766 SDLoc SL(Op);
7767 SDValue WideSrc0 =
7768 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
7769 SDValue WideSrc1 =
7770 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
7771
7772 SDValue Widened =
7773 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
7774
7775 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
7776 DAG.getConstant(0, SL, MVT::i32));
7777}
7778
7779SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
7780 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
7781 EVT VT = Op.getValueType();
7782 assert(VT == MVT::f16);
7783
7784 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
7785 EVT ExpVT = Exp.getValueType();
7786 if (ExpVT == MVT::i16)
7787 return Op;
7788
7789 SDLoc DL(Op);
7790
7791 // Correct the exponent type for f16 to i16.
7792 // Clamp the range of the exponent to the instruction's range.
7793
7794 // TODO: This should be a generic narrowing legalization, and can easily be
7795  // done for GlobalISel as well.
7796
7797 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
7798 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
7799
7800 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
7801 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
7802
7803 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
7804
7805 if (IsStrict) {
7806 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
7807 {Op.getOperand(0), Op.getOperand(1), TruncExp});
7808 }
7809
7810 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
7811}
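// For example, fldexp f16 %x, i32 %e (names illustrative) is handled by
// clamping %e to the signed 16-bit range with smax/smin, truncating it to
// i16, and emitting the fldexp with the narrowed exponent operand.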
7812
7813static unsigned getExtOpcodeForPromotedOp(SDValue Op) {
7814  switch (Op->getOpcode()) {
7815 case ISD::SRA:
7816 case ISD::SMIN:
7817 case ISD::SMAX:
7818 return ISD::SIGN_EXTEND;
7819 case ISD::SRL:
7820 case ISD::UMIN:
7821 case ISD::UMAX:
7822 return ISD::ZERO_EXTEND;
7823 case ISD::ADD:
7824 case ISD::SUB:
7825 case ISD::AND:
7826 case ISD::OR:
7827 case ISD::XOR:
7828 case ISD::SHL:
7829 case ISD::SELECT:
7830 case ISD::MUL:
7831 // operation result won't be influenced by garbage high bits.
7832 // TODO: are all of those cases correct, and are there more?
7833 return ISD::ANY_EXTEND;
7834 case ISD::SETCC: {
7835 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7836    return ISD::isSignedIntSetCC(CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7837  }
7838 default:
7839 llvm_unreachable("unexpected opcode!");
7840 }
7841}
7842
7843SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
7844 DAGCombinerInfo &DCI) const {
7845 const unsigned Opc = Op.getOpcode();
7846 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
7847 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
7848 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
7849 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
7850 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
7851
7852 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
7853 : Op->getOperand(0).getValueType();
7854 auto ExtTy = OpTy.changeElementType(MVT::i32);
7855
7856 if (DCI.isBeforeLegalizeOps() ||
7857 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
7858 return SDValue();
7859
7860 auto &DAG = DCI.DAG;
7861
7862 SDLoc DL(Op);
7863 SDValue LHS;
7864 SDValue RHS;
7865 if (Opc == ISD::SELECT) {
7866 LHS = Op->getOperand(1);
7867 RHS = Op->getOperand(2);
7868 } else {
7869 LHS = Op->getOperand(0);
7870 RHS = Op->getOperand(1);
7871 }
7872
7873 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
7874 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
7875
7876 // Special case: for shifts, the RHS always needs a zext.
7877 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
7878 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
7879 else
7880 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
7881
7882  // setcc always returns i1 (or a vector of i1) so no need to truncate after.
7883 if (Opc == ISD::SETCC) {
7884 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
7885 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
7886 }
7887
7888 // For other ops, we extend the operation's return type as well so we need to
7889 // truncate back to the original type.
7890 SDValue NewVal;
7891 if (Opc == ISD::SELECT)
7892 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
7893 else
7894 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
7895
7896 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
7897}
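// For example, a uniform i16 smin that is worth widening is rewritten as a
// truncate of an i32 smin whose operands are sign-extended from i16; for
// shift operations only the shift amount is unconditionally zero-extended.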
7898
7899SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
7900 SDValue Mag = Op.getOperand(0);
7901 EVT MagVT = Mag.getValueType();
7902
7903 if (MagVT.getVectorNumElements() > 2)
7904 return splitBinaryVectorOp(Op, DAG);
7905
7906 SDValue Sign = Op.getOperand(1);
7907 EVT SignVT = Sign.getValueType();
7908
7909 if (MagVT == SignVT)
7910 return Op;
7911
7912 // fcopysign v2f16:mag, v2f32:sign ->
7913 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
7914
7915 SDLoc SL(Op);
7916 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
7917 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
7918
7919 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
7920
7921 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
7922}
7923
7924// Custom lowering for vector multiplications and s_mul_u64.
7925SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
7926 EVT VT = Op.getValueType();
7927
7928 // Split vector operands.
7929 if (VT.isVector())
7930 return splitBinaryVectorOp(Op, DAG);
7931
7932 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
7933
7934 // There are four ways to lower s_mul_u64:
7935 //
7936 // 1. If all the operands are uniform, then we lower it as it is.
7937 //
7938  // 2. If the operands are divergent, then we have to split s_mul_u64 into 32-bit
7939 // multiplications because there is not a vector equivalent of s_mul_u64.
7940 //
7941 // 3. If the cost model decides that it is more efficient to use vector
7942  //    registers, then we have to split s_mul_u64 into 32-bit multiplications.
7943 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
7944 //
7945 // 4. If the cost model decides to use vector registers and both of the
7946  //    operands are zero-extended/sign-extended from 32 bits, then we split the
7947  //    s_mul_u64 into two 32-bit multiplications. The problem is that it is not
7948 // possible to check if the operands are zero-extended or sign-extended in
7949 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
7950 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
7951 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
7952 // If the cost model decides that we have to use vector registers, then
7953  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
7954  //    s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
7955 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
7956 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
7957 // SIInstrInfo.cpp .
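  // As an illustrative example (the value names are hypothetical): for
  //   %r = mul i64 %a, %b
  // where the result is uniform and both operands are known to be
  // zero-extended from 32 bits, case 4 applies and the node is rewritten
  // below to S_MUL_U64_U32_PSEUDO; if the operands are divergent instead,
  // SDValue() is returned and the multiply is split into 32-bit pieces as
  // described in case 2.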
7958
7959 if (Op->isDivergent())
7960 return SDValue();
7961
7962 SDValue Op0 = Op.getOperand(0);
7963 SDValue Op1 = Op.getOperand(1);
7964  // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
7965  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
7966  // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
7967 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
7968 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
7969 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
7970 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
7971 SDLoc SL(Op);
7972 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
7973 return SDValue(
7974 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
7975 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
7976 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
7977 if (Op0SignBits >= 33 && Op1SignBits >= 33)
7978 return SDValue(
7979 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
7980 // If all the operands are uniform, then we lower s_mul_u64 as it is.
7981 return Op;
7982}
7983
7984SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
7985 EVT VT = Op.getValueType();
7986 SDLoc SL(Op);
7987 SDValue LHS = Op.getOperand(0);
7988 SDValue RHS = Op.getOperand(1);
7989 bool isSigned = Op.getOpcode() == ISD::SMULO;
7990
7991 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
7992 const APInt &C = RHSC->getAPIntValue();
7993 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
7994 if (C.isPowerOf2()) {
7995 // smulo(x, signed_min) is same as umulo(x, signed_min).
7996 bool UseArithShift = isSigned && !C.isMinSignedValue();
7997 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
7998 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
7999 SDValue Overflow =
8000 DAG.getSetCC(SL, MVT::i1,
8001 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8002 Result, ShiftAmt),
8003 LHS, ISD::SETNE);
8004 return DAG.getMergeValues({Result, Overflow}, SL);
8005 }
8006 }
8007
8008 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8009 SDValue Top =
8010 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8011
8012 SDValue Sign = isSigned
8013 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8014 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8015 SL, MVT::i32))
8016 : DAG.getConstant(0, SL, VT);
8017 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8018
8019 return DAG.getMergeValues({Result, Overflow}, SL);
8020}
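// In the generic path above, overflow is detected by comparing the high half
// of the product (mulhs/mulhu) with the value it must have when no overflow
// occurs: the sign bits of the low half for smulo, or zero for umulo.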
8021
8022SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8023 if (Op->isDivergent()) {
8024 // Select to V_MAD_[IU]64_[IU]32.
8025 return Op;
8026 }
8027 if (Subtarget->hasSMulHi()) {
8028 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8029 return SDValue();
8030 }
8031 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8032 // calculate the high part, so we might as well do the whole thing with
8033 // V_MAD_[IU]64_[IU]32.
8034 return Op;
8035}
8036
8037SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8038 if (!Subtarget->isTrapHandlerEnabled() ||
8039 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8040 return lowerTrapEndpgm(Op, DAG);
8041
8042 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8043 : lowerTrapHsaQueuePtr(Op, DAG);
8044}
8045
8046SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8047 SDLoc SL(Op);
8048 SDValue Chain = Op.getOperand(0);
8049 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8050}
8051
8052SDValue
8053SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8054 const SDLoc &DL, Align Alignment,
8055 ImplicitParameter Param) const {
8056 MachineFunction &MF = DAG.getMachineFunction();
8057 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8058 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8059 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8060 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
8063}
8064
8065SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8066 SelectionDAG &DAG) const {
8067 SDLoc SL(Op);
8068 SDValue Chain = Op.getOperand(0);
8069
8070 SDValue QueuePtr;
8071 // For code object version 5, QueuePtr is passed through implicit kernarg.
8072 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8074 QueuePtr =
8075 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8076 } else {
8077 MachineFunction &MF = DAG.getMachineFunction();
8078 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8079 Register UserSGPR = Info->getQueuePtrUserSGPR();
8080
8081 if (UserSGPR == AMDGPU::NoRegister) {
8082 // We probably are in a function incorrectly marked with
8083 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8084 // trap, so just use a null pointer.
8085 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8086 } else {
8087 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8088 MVT::i64);
8089 }
8090 }
8091
8092 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8093 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8094
8095 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8096 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
8097 ToReg.getValue(1)};
8098 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8099}
8100
8101SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
8102 SDLoc SL(Op);
8103 SDValue Chain = Op.getOperand(0);
8104
8105 // We need to simulate the 's_trap 2' instruction on targets that run in
8106 // PRIV=1 (where it is treated as a nop).
8107 if (Subtarget->hasPrivEnabledTrap2NopBug())
8108 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8109
8110 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8111 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8112 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8113}
8114
8115SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
8116 SDLoc SL(Op);
8117 SDValue Chain = Op.getOperand(0);
8118 MachineFunction &MF = DAG.getMachineFunction();
8119
8120 if (!Subtarget->isTrapHandlerEnabled() ||
8121 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
8122 LLVMContext &Ctx = MF.getFunction().getContext();
8123 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
8124 "debugtrap handler not supported",
8125 Op.getDebugLoc(), DS_Warning));
8126 return Chain;
8127 }
8128
8129 uint64_t TrapID =
8130 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
8131 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
8132 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8133}
8134
8135SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
8136 SelectionDAG &DAG) const {
8137 if (Subtarget->hasApertureRegs()) {
8138 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
8139 ? AMDGPU::SRC_SHARED_BASE
8140 : AMDGPU::SRC_PRIVATE_BASE;
8141 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8142 !Subtarget->hasGloballyAddressableScratch()) &&
8143 "Cannot use src_private_base with globally addressable scratch!");
8144 // Note: this feature (register) is broken. When used as a 32-bit operand,
8145 // it returns a wrong value (all zeroes?). The real value is in the upper 32
8146 // bits.
8147 //
8148    // To work around the issue, emit a 64-bit copy from this register and
8149    // then extract the high bits. Note that this shouldn't even result in a
8150    // shift being emitted; it should simply become a pair of registers (e.g.):
8151 // s_mov_b64 s[6:7], src_shared_base
8152 // v_mov_b32_e32 v1, s7
8153 SDValue Copy =
8154 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
8155 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
8156 }
8157
8158 // For code object version 5, private_base and shared_base are passed through
8159 // implicit kernargs.
8160 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8164 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
8165 }
8166
8167 MachineFunction &MF = DAG.getMachineFunction();
8168 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8169 Register UserSGPR = Info->getQueuePtrUserSGPR();
8170 if (UserSGPR == AMDGPU::NoRegister) {
8171 // We probably are in a function incorrectly marked with
8172 // amdgpu-no-queue-ptr. This is undefined.
8173 return DAG.getPOISON(MVT::i32);
8174 }
8175
8176 SDValue QueuePtr =
8177 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
8178
8179 // Offset into amd_queue_t for group_segment_aperture_base_hi /
8180 // private_segment_aperture_base_hi.
8181 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
8182
8183 SDValue Ptr =
8184 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
8185
8186 // TODO: Use custom target PseudoSourceValue.
8187 // TODO: We should use the value from the IR intrinsic call, but it might not
8188 // be available and how do we get it?
8189 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
8190 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
8191 commonAlignment(Align(64), StructOffset),
8194}
8195
8196/// Return true if the value is a known valid address, such that a null check is
8197/// not necessary.
8199 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
8201 return true;
8202
8203 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
8204 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
8205
8206 // TODO: Search through arithmetic, handle arguments and loads
8207 // marked nonnull.
8208 return false;
8209}
8210
8211SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
8212 SelectionDAG &DAG) const {
8213 SDLoc SL(Op);
8214
8215 const AMDGPUTargetMachine &TM =
8216 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
8217
8218 unsigned DestAS, SrcAS;
8219 SDValue Src;
8220 bool IsNonNull = false;
8221 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
8222 SrcAS = ASC->getSrcAddressSpace();
8223 Src = ASC->getOperand(0);
8224 DestAS = ASC->getDestAddressSpace();
8225 } else {
8226 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
8227 Op.getConstantOperandVal(0) ==
8228 Intrinsic::amdgcn_addrspacecast_nonnull);
8229 Src = Op->getOperand(1);
8230 SrcAS = Op->getConstantOperandVal(2);
8231 DestAS = Op->getConstantOperandVal(3);
8232 IsNonNull = true;
8233 }
8234
8235 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
8236
8237 // flat -> local/private
8238 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
8239 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
8240 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
8241 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8242
8243 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
8244 Subtarget->hasGloballyAddressableScratch()) {
8245 // flat -> private with globally addressable scratch: subtract
8246 // src_flat_scratch_base_lo.
8247 SDValue FlatScratchBaseLo(
8248 DAG.getMachineNode(
8249 AMDGPU::S_MOV_B32, SL, MVT::i32,
8250 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8251 0);
8252 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
8253 }
8254
8255 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8256 return Ptr;
8257
8258 unsigned NullVal = TM.getNullPointerValue(DestAS);
8259 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8260 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
8261
8262 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
8263 SegmentNullPtr);
8264 }
8265 }
8266
8267 // local/private -> flat
8268 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
8269 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
8270 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
8271 SDValue CvtPtr;
8272 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
8273 Subtarget->hasGloballyAddressableScratch()) {
8274 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
8275 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
8276 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
8277 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
8278 ThreadID = DAG.getNode(
8279 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8280 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
8281 AllOnes, ThreadID);
8282 if (Subtarget->isWave64())
8283 ThreadID = DAG.getNode(
8284 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
8285 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
8286 AllOnes, ThreadID);
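 // The shift below is applied to the high half of the address, so for wave32
 // (wavefront size log2 == 5) it is 57 - 32 - 5 == 20, placing the thread ID
 // at bit 52 of the full 64-bit address; for wave64 it is 19, placing it at
 // bit 51, matching the formulas above.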
8287 SDValue ShAmt = DAG.getShiftAmountConstant(
8288 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8289 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
8290 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
8291 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8292 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
8293 // 64-bit hi:lo value.
8294 SDValue FlatScratchBase = {
8295 DAG.getMachineNode(
8296 AMDGPU::S_MOV_B64, SL, MVT::i64,
8297 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8298 0};
8299 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8300 } else {
8301 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8302 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
8303 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
8304 }
8305
8306 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
8307 return CvtPtr;
8308
8309 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8310 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
8311
8312 SDValue NonNull =
8313 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
8314
8315 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
8316 FlatNullPtr);
8317 }
8318 }
8319
8320 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8321 Op.getValueType() == MVT::i64) {
8322 const SIMachineFunctionInfo *Info =
8323 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
8324 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
8325 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
8326 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
8327 }
8328
8329 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8330 Src.getValueType() == MVT::i64)
8331 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
8332
8333 // global <-> flat are no-ops and never emitted.
8334
8335 // Invalid casts are poison.
8336 return DAG.getPOISON(Op->getValueType(0));
8337}
8338
8339// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
8340// the small vector and inserting them into the big vector. That is better than
8341// the default expansion of doing it via a stack slot. Even though the use of
8342// the stack slot would be optimized away afterwards, the stack slot itself
8343// remains.
8344SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
8345 SelectionDAG &DAG) const {
8346 SDValue Vec = Op.getOperand(0);
8347 SDValue Ins = Op.getOperand(1);
8348 SDValue Idx = Op.getOperand(2);
8349 EVT VecVT = Vec.getValueType();
8350 EVT InsVT = Ins.getValueType();
8351 EVT EltVT = VecVT.getVectorElementType();
8352 unsigned InsNumElts = InsVT.getVectorNumElements();
8353 unsigned IdxVal = Idx->getAsZExtVal();
8354 SDLoc SL(Op);
8355
8356 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
8357 // Insert 32-bit registers at a time.
8358 assert(InsNumElts % 2 == 0 && "expect legal vector types");
8359
8360 unsigned VecNumElts = VecVT.getVectorNumElements();
8361 EVT NewVecVT =
8362 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
8363 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8364 : EVT::getVectorVT(*DAG.getContext(),
8365 MVT::i32, InsNumElts / 2);
8366
8367 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
8368 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
8369
8370 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
8371 SDValue Elt;
8372 if (InsNumElts == 2) {
8373 Elt = Ins;
8374 } else {
8375 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
8376 DAG.getConstant(I, SL, MVT::i32));
8377 }
8378 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
8379 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
8380 }
8381
8382 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
8383 }
8384
8385 for (unsigned I = 0; I != InsNumElts; ++I) {
8386 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
8387 DAG.getConstant(I, SL, MVT::i32));
8388 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
8389 DAG.getConstant(IdxVal + I, SL, MVT::i32));
8390 }
8391 return Vec;
8392}
8393
8394SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
8395 SelectionDAG &DAG) const {
8396 SDValue Vec = Op.getOperand(0);
8397 SDValue InsVal = Op.getOperand(1);
8398 SDValue Idx = Op.getOperand(2);
8399 EVT VecVT = Vec.getValueType();
8400 EVT EltVT = VecVT.getVectorElementType();
8401 unsigned VecSize = VecVT.getSizeInBits();
8402 unsigned EltSize = EltVT.getSizeInBits();
8403 SDLoc SL(Op);
8404
8405 // Specially handle the case of v4i16 with static indexing.
8406 unsigned NumElts = VecVT.getVectorNumElements();
8407 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
8408 if (NumElts == 4 && EltSize == 16 && KIdx) {
8409 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
8410
8411 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8412 DAG.getConstant(0, SL, MVT::i32));
8413 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
8414 DAG.getConstant(1, SL, MVT::i32));
8415
8416 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
8417 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
8418
8419 unsigned Idx = KIdx->getZExtValue();
8420 bool InsertLo = Idx < 2;
8421 SDValue InsHalf = DAG.getNode(
8422 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
8423 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
8424 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8425
8426 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
8427
8428 SDValue Concat =
8429 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
8430 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8431
8432 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
8433 }
8434
8435 // Static indexing does not lower to stack access, and hence there is no need
8436 // for special custom lowering to avoid stack access.
8437 if (isa<ConstantSDNode>(Idx))
8438 return SDValue();
8439
8440 // Avoid stack access for dynamic indexing by custom lowering to
8441 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
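 //
 // For example, inserting into element 2 of a v4i16 vector: the bit index is
 // 2 * 16 == 32, the mask is 0xffff << 32, and the result is
 // (splat(val) & mask) | (vec & ~mask), which steps 1-4 below compute.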
8442
8443 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
8444
8445 MVT IntVT = MVT::getIntegerVT(VecSize);
8446
8447 // Convert vector index to bit-index and get the required bit mask.
8448 assert(isPowerOf2_32(EltSize));
8449 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
8450 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8451 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8452 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
8453 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
8454
8455 // 1. Create a congruent vector with the target value in each element.
8456 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
8457 DAG.getSplatBuildVector(VecVT, SL, InsVal));
8458
8459 // 2. Mask off all other indices except the required index within (1).
8460 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
8461
8462 // 3. Mask off the required index within the target vector.
8463 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8464 SDValue RHS =
8465 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
8466
8467 // 4. Get (2) and (3) ORed into the target vector.
8468 SDValue BFI =
8469 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
8470
8471 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
8472}
8473
8474SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
8475 SelectionDAG &DAG) const {
8476 SDLoc SL(Op);
8477
8478 EVT ResultVT = Op.getValueType();
8479 SDValue Vec = Op.getOperand(0);
8480 SDValue Idx = Op.getOperand(1);
8481 EVT VecVT = Vec.getValueType();
8482 unsigned VecSize = VecVT.getSizeInBits();
8483 EVT EltVT = VecVT.getVectorElementType();
8484
8485 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
8486
8487 // Make sure we do any optimizations that will make it easier to fold
8488 // source modifiers before obscuring it with bit operations.
8489
8490 // XXX - Why doesn't this get called when vector_shuffle is expanded?
8491 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
8492 return Combined;
8493
8494 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8495 SDValue Lo, Hi;
8496 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
8497
8498 if (VecSize == 128) {
8499 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
8500 Lo = DAG.getBitcast(LoVT,
8501 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8502 DAG.getConstant(0, SL, MVT::i32)));
8503 Hi = DAG.getBitcast(HiVT,
8504 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8505 DAG.getConstant(1, SL, MVT::i32)));
8506 } else if (VecSize == 256) {
8507 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
8508 SDValue Parts[4];
8509 for (unsigned P = 0; P < 4; ++P) {
8510 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8511 DAG.getConstant(P, SL, MVT::i32));
8512 }
8513
8514 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8515 Parts[0], Parts[1]));
8516 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
8517 Parts[2], Parts[3]));
8518 } else {
8519 assert(VecSize == 512);
8520
8521 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
8522 SDValue Parts[8];
8523 for (unsigned P = 0; P < 8; ++P) {
8524 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
8525 DAG.getConstant(P, SL, MVT::i32));
8526 }
8527
8528 Lo = DAG.getBitcast(LoVT,
8529 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8530 Parts[0], Parts[1], Parts[2], Parts[3]));
8531 Hi = DAG.getBitcast(HiVT,
8532 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
8533 Parts[4], Parts[5], Parts[6], Parts[7]));
8534 }
8535
8536 EVT IdxVT = Idx.getValueType();
8537 unsigned NElem = VecVT.getVectorNumElements();
8538 assert(isPowerOf2_32(NElem));
8539 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
8540 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
8541 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
8542 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
8543 }
8544
8545 assert(VecSize <= 64);
8546
8547 MVT IntVT = MVT::getIntegerVT(VecSize);
8548
8549 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
8550 SDValue VecBC = peekThroughBitcasts(Vec);
8551 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8552 SDValue Src = VecBC.getOperand(0);
8553 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8554 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
8555 }
8556
8557 unsigned EltSize = EltVT.getSizeInBits();
8558 assert(isPowerOf2_32(EltSize));
8559
8560 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
8561
8562 // Convert vector index to bit-index (* EltSize)
8563 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
8564
8565 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
8566 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
8567
8568 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8569 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
8570 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
8571 }
8572
8573 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
8574}
8575
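// Returns true if the mask pair at Elt reads two consecutive source elements
// starting at an even index, e.g. <2,3> but not <1,2> or <3,2>.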
8576static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
8577 assert(Elt % 2 == 0);
8578 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8579}
8580
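// Returns true if the mask pair at Elt reads an odd source element followed by
// an even one, e.g. <3,2> or <1,4>; undef (negative) entries do not qualify.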
8581static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
8582 assert(Elt % 2 == 0);
8583 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8584 !(Mask[Elt + 1] & 1);
8585}
8586
8587SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
8588 SelectionDAG &DAG) const {
8589 SDLoc SL(Op);
8590 EVT ResultVT = Op.getValueType();
8591 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
8592 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
8593 const int NewSrcNumElts = 2;
8594 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
8595 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
8596
8597 // Break up the shuffle into register-sized pieces.
8598 //
8599 // We're trying to form sub-shuffles that the register allocation pipeline
8600 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
8601 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
8602 // pair of copies into a consecutive register copy, so use the ordinary
8603 // extract_vector_elt lowering unless we can use the shuffle.
8604 //
8605 // TODO: This is a bit of hack, and we should probably always use
8606 // extract_subvector for the largest possible subvector we can (or at least
8607 // use it for PackVT aligned pieces). However, we have worse support for
8608 // combines on them and don't directly treat extract_subvector / insert_subvector
8609 // as legal. The DAG scheduler also ends up doing a worse job with the
8610 // extract_subvectors.
8611 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
8612
8613 // vector_shuffle <0,1,6,7> lhs, rhs
8614 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
8615 //
8616 // vector_shuffle <6,7,2,3> lhs, rhs
8617 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
8618 //
8619 // vector_shuffle <6,7,0,1> lhs, rhs
8620 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
8621
8622 // Avoid scalarizing when both halves are reading from consecutive elements.
8623
8624 // If we're treating 2 element shuffles as legal, also create odd-to-even
8625 // shuffles of neighboring pairs.
8626 //
8627 // vector_shuffle <3,2,7,6> lhs, rhs
8628 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
8629 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
8630
8631 SmallVector<SDValue, 16> Pieces;
8632 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
8633 if (ShouldUseConsecutiveExtract &&
8634 elementPairIsContiguous(SVN->getMask(), I)) {
8635 const int Idx = SVN->getMaskElt(I);
8636 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8637 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8638 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
8639 SVN->getOperand(VecIdx),
8640 DAG.getConstant(EltIdx, SL, MVT::i32));
8641 Pieces.push_back(SubVec);
8643 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
8644 isOperationLegal(ISD::VECTOR_SHUFFLE, PackVT)) {
8645 int Idx0 = SVN->getMaskElt(I);
8645 int Idx1 = SVN->getMaskElt(I + 1);
8646
8647 SDValue SrcOp0 = SVN->getOperand(0);
8648 SDValue SrcOp1 = SrcOp0;
8649 if (Idx0 >= SrcNumElts) {
8650 SrcOp0 = SVN->getOperand(1);
8651 Idx0 -= SrcNumElts;
8652 }
8653
8654 if (Idx1 >= SrcNumElts) {
8655 SrcOp1 = SVN->getOperand(1);
8656 Idx1 -= SrcNumElts;
8657 }
8658
8659 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8660 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8661
8662 // Extract nearest even aligned piece.
8663 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
8664 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
8665 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
8666 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
8667
8668 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8669 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8670
8671 SDValue Result0 = SubVec0;
8672 SDValue Result1 = SubVec0;
8673
8674 if (SubVec0 != SubVec1) {
8675 NewMaskIdx1 += NewSrcNumElts;
8676 Result1 = SubVec1;
8677 } else {
8678 Result1 = DAG.getPOISON(PackVT);
8679 }
8680
8681 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
8682 {NewMaskIdx0, NewMaskIdx1});
8683 Pieces.push_back(Shuf);
8684 } else {
8685 const int Idx0 = SVN->getMaskElt(I);
8686 const int Idx1 = SVN->getMaskElt(I + 1);
8687 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8688 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8689 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8690 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8691
8692 SDValue Vec0 = SVN->getOperand(VecIdx0);
8693 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
8694 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
8695
8696 SDValue Vec1 = SVN->getOperand(VecIdx1);
8697 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
8698 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
8699 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
8700 }
8701 }
8702
8703 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
8704}
8705
8706SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
8707 SelectionDAG &DAG) const {
8708 SDValue SVal = Op.getOperand(0);
8709 EVT ResultVT = Op.getValueType();
8710 EVT SValVT = SVal.getValueType();
8711 SDValue UndefVal = DAG.getPOISON(SValVT);
8712 SDLoc SL(Op);
8713
8714 SmallVector<SDValue, 8> VElts;
8715 VElts.push_back(SVal);
8716 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
8717 VElts.push_back(UndefVal);
8718
8719 return DAG.getBuildVector(ResultVT, SL, VElts);
8720}
8721
8722SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
8723 SelectionDAG &DAG) const {
8724 SDLoc SL(Op);
8725 EVT VT = Op.getValueType();
8726
8727 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8728 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
8729
8730 SDValue Lo = Op.getOperand(0);
8731 SDValue Hi = Op.getOperand(1);
8732
8733 // Avoid adding defined bits with the zero_extend.
8734 if (Hi.isUndef()) {
8735 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8736 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
8737 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
8738 }
8739
8740 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
8741 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
8742
8743 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
8744 DAG.getConstant(16, SL, MVT::i32));
8745 if (Lo.isUndef())
8746 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
8747
8748 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
8749 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
8750
8751 SDValue Or =
8752 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
8753 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
8754 }
8755
8756 // Split into 2-element chunks.
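 // For example, a v8f16 build_vector becomes four v2f16 parts, each bitcast to
 // i32, reassembled as a v4i32 build_vector and bitcast back to v8f16.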
8757 const unsigned NumParts = VT.getVectorNumElements() / 2;
8758 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
8759 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
8760
8761 SmallVector<SDValue> Casts;
8762 for (unsigned P = 0; P < NumParts; ++P) {
8763 SDValue Vec = DAG.getBuildVector(
8764 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8765 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
8766 }
8767
8768 SDValue Blend =
8769 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
8770 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
8771}
8772
8773bool SITargetLowering::isOffsetFoldingLegal(
8774 const GlobalAddressSDNode *GA) const {
8775 // OSes that use ELF REL relocations (instead of RELA) can only store a
8776 // 32-bit addend in the instruction, so it is not safe to allow offset folding
8777 // which can create arbitrary 64-bit addends. (This is only a problem for
8778 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
8779 // the high 32 bits of the addend.)
8780 //
8781 // This should be kept in sync with how HasRelocationAddend is initialized in
8782 // the constructor of ELFAMDGPUAsmBackend.
8783 if (!Subtarget->isAmdHsaOS())
8784 return false;
8785
8786 // We can fold offsets for anything that doesn't require a GOT relocation.
8787 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
8788 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
8789 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
8790 !shouldEmitGOTReloc(GA->getGlobal());
8791}
8792
8793static SDValue
8794buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
8795 const SDLoc &DL, int64_t Offset, EVT PtrVT,
8796 unsigned GAFlags = SIInstrInfo::MO_NONE) {
8797 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
8798 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
8799 // lowered to the following code sequence:
8800 //
8801 // For constant address space:
8802 // s_getpc_b64 s[0:1]
8803 // s_add_u32 s0, s0, $symbol
8804 // s_addc_u32 s1, s1, 0
8805 //
8806 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8807 // a fixup or relocation is emitted to replace $symbol with a literal
8808 // constant, which is a pc-relative offset from the encoding of the $symbol
8809 // operand to the global variable.
8810 //
8811 // For global address space:
8812 // s_getpc_b64 s[0:1]
8813 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
8814 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
8815 //
8816 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
8817 // fixups or relocations are emitted to replace $symbol@*@lo and
8818 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
8819 // which is a 64-bit pc-relative offset from the encoding of the $symbol
8820 // operand to the global variable.
8821 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
8822 assert(GAFlags != SIInstrInfo::MO_NONE);
8823
8824 SDValue Ptr =
8825 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
8826 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
8827 }
8828
8829 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
8830 SDValue PtrHi;
8831 if (GAFlags == SIInstrInfo::MO_NONE)
8832 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
8833 else
8834 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
8835 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
8836}
8837
8838SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
8839 SDValue Op,
8840 SelectionDAG &DAG) const {
8841 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
8842 SDLoc DL(GSD);
8843 EVT PtrVT = Op.getValueType();
8844
8845 const GlobalValue *GV = GSD->getGlobal();
8846 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8847 shouldUseLDSConstAddress(GV)) ||
8848 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
8849 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
8850 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
8851 GV->hasExternalLinkage()) {
8852 Type *Ty = GV->getValueType();
8853 // HIP uses an unsized array `extern __shared__ T s[]` or similar
8854 // zero-sized type in other languages to declare the dynamic shared
8855 // memory whose size is not known at compile time. They will be
8856 // allocated by the runtime and placed directly after the statically
8857 // allocated ones. They all share the same offset.
8858 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
8859 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
8860 // Adjust alignment for that dynamic shared memory array.
8861 Function &F = DAG.getMachineFunction().getFunction();
8862 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
8863 MFI->setUsesDynamicLDS(true);
8864 return SDValue(
8865 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
8866 }
8867 }
8868 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
8869 }
8870
8871 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8872 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
8873 SIInstrInfo::MO_ABS32_LO);
8874 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
8875 }
8876
8877 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8878 if (Subtarget->has64BitLiterals()) {
8879 SDValue Addr = DAG.getTargetGlobalAddress(
8880 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
8881 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
8882 0);
8883 }
8884
8885 SDValue AddrLo = DAG.getTargetGlobalAddress(
8886 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
8887 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
8888
8889 SDValue AddrHi = DAG.getTargetGlobalAddress(
8890 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
8891 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
8892
8893 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
8894 }
8895
8896 if (shouldEmitFixup(GV))
8897 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
8898
8899 if (shouldEmitPCReloc(GV))
8900 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
8901 SIInstrInfo::MO_REL32);
8902
8903 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
8904 SIInstrInfo::MO_GOTPCREL32);
8905 PointerType *PtrTy =
8906 PointerType::get(*DAG.getContext(), AMDGPUAS::CONSTANT_ADDRESS);
8907 const DataLayout &DataLayout = DAG.getDataLayout();
8908 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
8909 MachinePointerInfo PtrInfo =
8910 MachinePointerInfo::getGOT(DAG.getMachineFunction());
8911
8912 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
8913 MachineMemOperand::MODereferenceable |
8914 MachineMemOperand::MOInvariant);
8915}
8916
8917SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
8918 const SDLoc &DL, SDValue V) const {
8919 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
8920 // the destination register.
8921 //
8922 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
8923 // so we will end up with redundant moves to m0.
8924 //
8925 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
8926
8927 // A Null SDValue creates a glue result.
8928 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
8929 V, Chain);
8930 return SDValue(M0, 0);
8931}
8932
8933SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
8934 MVT VT,
8935 unsigned Offset) const {
8936 SDLoc SL(Op);
8937 SDValue Param = lowerKernargMemParameter(
8938 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
8939 // The local size values will have the hi 16-bits as zero.
8940 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
8941 DAG.getValueType(VT));
8942}
8943
8944static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8945 EVT VT) {
8946 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8947 DAG.getMachineFunction().getFunction(),
8948 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
8949 return DAG.getPOISON(VT);
8950}
8951
8952static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
8953 EVT VT) {
8954 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8955 DAG.getMachineFunction().getFunction(),
8956 "intrinsic not supported on subtarget", DL.getDebugLoc()));
8957 return DAG.getPOISON(VT);
8958}
8959
8960static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
8961 ArrayRef<SDValue> Elts) {
8962 assert(!Elts.empty());
8963 MVT Type;
8964 unsigned NumElts = Elts.size();
8965
8966 if (NumElts <= 12) {
8967 Type = MVT::getVectorVT(MVT::f32, NumElts);
8968 } else {
8969 assert(Elts.size() <= 16);
8970 Type = MVT::v16f32;
8971 NumElts = 16;
8972 }
8973
8974 SmallVector<SDValue, 16> VecElts(NumElts);
8975 for (unsigned i = 0; i < Elts.size(); ++i) {
8976 SDValue Elt = Elts[i];
8977 if (Elt.getValueType() != MVT::f32)
8978 Elt = DAG.getBitcast(MVT::f32, Elt);
8979 VecElts[i] = Elt;
8980 }
8981 for (unsigned i = Elts.size(); i < NumElts; ++i)
8982 VecElts[i] = DAG.getPOISON(MVT::f32);
8983
8984 if (NumElts == 1)
8985 return VecElts[0];
8986 return DAG.getBuildVector(Type, DL, VecElts);
8987}
8988
8989static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
8990 SDValue Src, int ExtraElts) {
8991 EVT SrcVT = Src.getValueType();
8992
8993 SmallVector<SDValue, 8> Elts;
8994
8995 if (SrcVT.isVector())
8996 DAG.ExtractVectorElements(Src, Elts);
8997 else
8998 Elts.push_back(Src);
8999
9000 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9001 while (ExtraElts--)
9002 Elts.push_back(Undef);
9003
9004 return DAG.getBuildVector(CastVT, DL, Elts);
9005}
9006
9007// Re-construct the required return value for an image load intrinsic.
9008// This is more complicated due to the optional use of TexFailCtrl, which means
9009// the required return type is an aggregate.
9010static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
9011 ArrayRef<EVT> ResultTypes, bool IsTexFail,
9012 bool Unpacked, bool IsD16, int DMaskPop,
9013 int NumVDataDwords, bool IsAtomicPacked16Bit,
9014 const SDLoc &DL) {
9015 // Determine the required return type. This is the same regardless of
9016 // IsTexFail flag
9017 EVT ReqRetVT = ResultTypes[0];
9018 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
9019 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9020 ? (ReqRetNumElts + 1) / 2
9021 : ReqRetNumElts;
9022
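 // For example, with a dmask popcount of 3 a packed d16 result pops
 // (3 + 1) / 2 == 2 dwords, while an unpacked or 32-bit result pops 3.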
9023 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9024
9025 MVT DataDwordVT =
9026 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
9027
9028 MVT MaskPopVT =
9029 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
9030
9031 SDValue Data(Result, 0);
9032 SDValue TexFail;
9033
9034 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
9035 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
9036 if (MaskPopVT.isVector()) {
9037 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
9038 SDValue(Result, 0), ZeroIdx);
9039 } else {
9040 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
9041 SDValue(Result, 0), ZeroIdx);
9042 }
9043 }
9044
9045 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
9046 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
9047 NumDataDwords - MaskPopDwords);
9048
9049 if (IsD16)
9050 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
9051
9052 EVT LegalReqRetVT = ReqRetVT;
9053 if (!ReqRetVT.isVector()) {
9054 if (!Data.getValueType().isInteger())
9055 Data = DAG.getNode(ISD::BITCAST, DL,
9056 Data.getValueType().changeTypeToInteger(), Data);
9057 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
9058 } else {
9059 // We need to widen the return vector to a legal type
9060 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
9061 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
9062 LegalReqRetVT =
9063 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
9064 ReqRetVT.getVectorNumElements() + 1);
9065 }
9066 }
9067 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
9068
9069 if (IsTexFail) {
9070 TexFail =
9071 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
9072 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
9073
9074 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
9075 }
9076
9077 if (Result->getNumValues() == 1)
9078 return Data;
9079
9080 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
9081}
9082
9083static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
9084 SDValue *LWE, bool &IsTexFail) {
9085 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
9086
9087 uint64_t Value = TexFailCtrlConst->getZExtValue();
9088 if (Value) {
9089 IsTexFail = true;
9090 }
9091
9092 SDLoc DL(TexFailCtrlConst);
9093 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
9094 Value &= ~(uint64_t)0x1;
9095 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
9096 Value &= ~(uint64_t)0x2;
9097
9098 return Value == 0;
9099}
9100
9101static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
9102 MVT PackVectorVT,
9103 SmallVectorImpl<SDValue> &PackedAddrs,
9104 unsigned DimIdx, unsigned EndIdx,
9105 unsigned NumGradients) {
9106 SDLoc DL(Op);
9107 for (unsigned I = DimIdx; I < EndIdx; I++) {
9108 SDValue Addr = Op.getOperand(I);
9109
9110 // Gradients are packed with undef for each coordinate.
9111 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
9112 // 1D: undef,dx/dh; undef,dx/dv
9113 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
9114 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
9115 if (((I + 1) >= EndIdx) ||
9116 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
9117 I == DimIdx + NumGradients - 1))) {
9118 if (Addr.getValueType() != MVT::i16)
9119 Addr = DAG.getBitcast(MVT::i16, Addr);
9120 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
9121 } else {
9122 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
9123 I++;
9124 }
9125 Addr = DAG.getBitcast(MVT::f32, Addr);
9126 PackedAddrs.push_back(Addr);
9127 }
9128}
9129
9130SDValue SITargetLowering::lowerImage(SDValue Op,
9131 const AMDGPU::ImageDimIntrinsicInfo *Intr,
9132 SelectionDAG &DAG, bool WithChain) const {
9133 SDLoc DL(Op);
9134 MachineFunction &MF = DAG.getMachineFunction();
9135 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
9136 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9137 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
9138 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
9139 unsigned IntrOpcode = Intr->BaseOpcode;
9140 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
9141 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9142 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9143
9144 SmallVector<EVT, 3> ResultTypes(Op->values());
9145 SmallVector<EVT, 3> OrigResultTypes(Op->values());
9146 bool IsD16 = false;
9147 bool IsG16 = false;
9148 bool IsA16 = false;
9149 SDValue VData;
9150 int NumVDataDwords = 0;
9151 bool AdjustRetType = false;
9152 bool IsAtomicPacked16Bit = false;
9153
9154 // Offset of intrinsic arguments
9155 const unsigned ArgOffset = WithChain ? 2 : 1;
9156
9157 unsigned DMask;
9158 unsigned DMaskLanes = 0;
9159
9160 if (BaseOpcode->Atomic) {
9161 VData = Op.getOperand(2);
9162
9163 IsAtomicPacked16Bit =
9164 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9165 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
9166
9167 bool Is64Bit = VData.getValueSizeInBits() == 64;
9168 if (BaseOpcode->AtomicX2) {
9169 SDValue VData2 = Op.getOperand(3);
9170 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
9171 {VData, VData2});
9172 if (Is64Bit)
9173 VData = DAG.getBitcast(MVT::v4i32, VData);
9174
9175 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9176 DMask = Is64Bit ? 0xf : 0x3;
9177 NumVDataDwords = Is64Bit ? 4 : 2;
9178 } else {
9179 DMask = Is64Bit ? 0x3 : 0x1;
9180 NumVDataDwords = Is64Bit ? 2 : 1;
9181 }
9182 } else {
9183 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
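 // Gather4 returns four samples of a single channel, so it always produces
 // four data lanes regardless of the dmask value.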
9184 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
9185
9186 if (BaseOpcode->Store) {
9187 VData = Op.getOperand(2);
9188
9189 MVT StoreVT = VData.getSimpleValueType();
9190 if (StoreVT.getScalarType() == MVT::f16) {
9191 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9192 return Op; // D16 is unsupported for this instruction
9193
9194 IsD16 = true;
9195 VData = handleD16VData(VData, DAG, true);
9196 }
9197
9198 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
9199 } else if (!BaseOpcode->NoReturn) {
9200 // Work out the num dwords based on the dmask popcount and underlying type
9201 // and whether packing is supported.
9202 MVT LoadVT = ResultTypes[0].getSimpleVT();
9203 if (LoadVT.getScalarType() == MVT::f16) {
9204 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
9205 return Op; // D16 is unsupported for this instruction
9206
9207 IsD16 = true;
9208 }
9209
9210 // Confirm that the return type is large enough for the dmask specified
9211 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
9212 (!LoadVT.isVector() && DMaskLanes > 1))
9213 return Op;
9214
9215 // The sq block of gfx8 and gfx9 does not estimate register use correctly
9216 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
9217 // instructions.
9218 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9219 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
9220 NumVDataDwords = (DMaskLanes + 1) / 2;
9221 else
9222 NumVDataDwords = DMaskLanes;
9223
9224 AdjustRetType = true;
9225 }
9226 }
9227
9228 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
9229 SmallVector<SDValue, 4> VAddrs;
9230
9231 // Check for 16 bit addresses or derivatives and pack if true.
9232 MVT VAddrVT =
9233 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
9234 MVT VAddrScalarVT = VAddrVT.getScalarType();
9235 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9236 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9237
9238 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
9239 VAddrScalarVT = VAddrVT.getScalarType();
9240 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9241 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9242
9243 // Push back extra arguments.
9244 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
9245 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
9246 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
9247 // Special handling of bias when A16 is on. Bias is of type half but
9248 // occupies full 32-bit.
9249 SDValue Bias = DAG.getBuildVector(
9250 MVT::v2f16, DL,
9251 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
9252 VAddrs.push_back(Bias);
9253 } else {
9254 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
9255 "Bias needs to be converted to 16 bit in A16 mode");
9256 VAddrs.push_back(Op.getOperand(ArgOffset + I));
9257 }
9258 }
9259
9260 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
9261 // 16 bit gradients are supported, but are tied to the A16 control
9262 // so both gradients and addresses must be 16 bit
9263 LLVM_DEBUG(
9264 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
9265 "require 16 bit args for both gradients and addresses");
9266 return Op;
9267 }
9268
9269 if (IsA16) {
9270 if (!ST->hasA16()) {
9271 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
9272 "support 16 bit addresses\n");
9273 return Op;
9274 }
9275 }
9276
9277 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
9278 // is set then we have to compress/pack operands (either address,
9279 // gradient, or both).
9280 // In the case where a16 and gradients are tied (no G16 support), we
9281 // have already verified that both IsA16 and IsG16 are true.
9282 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
9283 // Activate g16
9284 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9285 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
9286 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
9287 }
9288
9289 // Add gradients (packed or unpacked)
9290 if (IsG16) {
9291 // Pack the gradients
9292 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
9293 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
9294 ArgOffset + Intr->GradientStart,
9295 ArgOffset + Intr->CoordStart, Intr->NumGradients);
9296 } else {
9297 for (unsigned I = ArgOffset + Intr->GradientStart;
9298 I < ArgOffset + Intr->CoordStart; I++)
9299 VAddrs.push_back(Op.getOperand(I));
9300 }
9301
9302 // Add addresses (packed or unpacked)
9303 if (IsA16) {
9304 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
9305 ArgOffset + Intr->CoordStart, VAddrEnd,
9306 0 /* No gradients */);
9307 } else {
9308 // Add uncompressed address
9309 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
9310 VAddrs.push_back(Op.getOperand(I));
9311 }
9312
9313 // If the register allocator cannot place the address registers contiguously
9314 // without introducing moves, then using the non-sequential address encoding
9315 // is always preferable, since it saves VALU instructions and is usually a
9316 // wash in terms of code size or even better.
9317 //
9318 // However, we currently have no way of hinting to the register allocator that
9319 // MIMG addresses should be placed contiguously when it is possible to do so,
9320 // so force non-NSA for the common 2-address case as a heuristic.
9321 //
9322 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
9323 // allocation when possible.
9324 //
9325 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
9326 // set of the remaining addresses.
9327 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
9328 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
9329 const bool UseNSA = ST->hasNSAEncoding() &&
9330 VAddrs.size() >= ST->getNSAThreshold(MF) &&
9331 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
9332 const bool UsePartialNSA =
9333 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
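 // With partial NSA, the first NSAMaxSize - 1 addresses remain separate
 // operands and the remaining addresses are packed into one contiguous
 // register sequence below.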
9334
9335 SDValue VAddr;
9336 if (UsePartialNSA) {
9337 VAddr = getBuildDwordsVector(DAG, DL,
9338 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9339 } else if (!UseNSA) {
9340 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
9341 }
9342
9343 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
9344 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
9345 SDValue Unorm;
9346 if (!BaseOpcode->Sampler) {
9347 Unorm = True;
9348 } else {
9349 uint64_t UnormConst =
9350 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
9351
9352 Unorm = UnormConst ? True : False;
9353 }
9354
9355 SDValue TFE;
9356 SDValue LWE;
9357 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
9358 bool IsTexFail = false;
9359 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9360 return Op;
9361
9362 if (IsTexFail) {
9363 if (!DMaskLanes) {
9364 // Expecting to get an error flag since TFC is on - and dmask is 0
9365 // Force dmask to be at least 1 otherwise the instruction will fail
9366 DMask = 0x1;
9367 DMaskLanes = 1;
9368 NumVDataDwords = 1;
9369 }
9370 NumVDataDwords += 1;
9371 AdjustRetType = true;
9372 }
9373
9374 // Something earlier has tagged the return type as needing adjustment.
9375 // This happens if the instruction is a load or has TexFailCtrl flags set.
9376 if (AdjustRetType) {
9377 // NumVDataDwords reflects the true number of dwords required in the return
9378 // type
9379 if (DMaskLanes == 0 && !BaseOpcode->Store) {
9380 // This is a no-op load. This can be eliminated
9381 SDValue Undef = DAG.getPOISON(Op.getValueType());
9382 if (isa<MemSDNode>(Op))
9383 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
9384 return Undef;
9385 }
9386
9387 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
9388 MVT::i32, NumVDataDwords)
9389 : MVT::i32;
9390
9391 ResultTypes[0] = NewVT;
9392 if (ResultTypes.size() == 3) {
9393 // The original result was an aggregate type used for TexFailCtrl results.
9394 // The actual instruction returns as a vector type which has now been
9395 // created. Remove the aggregate result.
9396 ResultTypes.erase(&ResultTypes[1]);
9397 }
9398 }
9399
9400 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
9401 if (BaseOpcode->Atomic)
9402 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
9403 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
9404 AMDGPU::CPol::VOLATILE))
9405 return Op;
9406
9407 SmallVector<SDValue, 26> Ops;
9408 if (BaseOpcode->Store || BaseOpcode->Atomic)
9409 Ops.push_back(VData); // vdata
9410 if (UsePartialNSA) {
9411 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
9412 Ops.push_back(VAddr);
9413 } else if (UseNSA)
9414 append_range(Ops, VAddrs);
9415 else
9416 Ops.push_back(VAddr);
9417 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
9418 EVT RsrcVT = Rsrc.getValueType();
9419 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9420 return Op;
9421 Ops.push_back(Rsrc);
9422 if (BaseOpcode->Sampler) {
9423 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
9424 if (Samp.getValueType() != MVT::v4i32)
9425 return Op;
9426 Ops.push_back(Samp);
9427 }
9428 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
9429 if (IsGFX10Plus)
9430 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
9431 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9432 Ops.push_back(Unorm);
9433 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
9434 Ops.push_back(IsA16 && // r128, a16 for gfx9
9435 ST->hasFeature(AMDGPU::FeatureR128A16)
9436 ? True
9437 : False);
9438 if (IsGFX10Plus)
9439 Ops.push_back(IsA16 ? True : False);
9440
9441 if (!Subtarget->hasGFX90AInsts())
9442 Ops.push_back(TFE); // tfe
9443 else if (TFE->getAsZExtVal()) {
9444 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9446 "TFE is not supported on this GPU", DL.getDebugLoc()));
9447 }
9448
9449 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
9450 Ops.push_back(LWE); // lwe
9451 if (!IsGFX10Plus)
9452 Ops.push_back(DimInfo->DA ? True : False);
9453 if (BaseOpcode->HasD16)
9454 Ops.push_back(IsD16 ? True : False);
9455 if (isa<MemSDNode>(Op))
9456 Ops.push_back(Op.getOperand(0)); // chain
9457
9458 int NumVAddrDwords =
9459 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
9460 int Opcode = -1;
9461
9462 if (IsGFX12Plus) {
9463 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
9464 NumVDataDwords, NumVAddrDwords);
9465 } else if (IsGFX11Plus) {
9466 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9467 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9468 : AMDGPU::MIMGEncGfx11Default,
9469 NumVDataDwords, NumVAddrDwords);
9470 } else if (IsGFX10Plus) {
9471 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
9472 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9473 : AMDGPU::MIMGEncGfx10Default,
9474 NumVDataDwords, NumVAddrDwords);
9475 } else {
9476 if (Subtarget->hasGFX90AInsts()) {
9477 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
9478 NumVDataDwords, NumVAddrDwords);
9479 if (Opcode == -1) {
9480 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9482 "requested image instruction is not supported on this GPU",
9483 DL.getDebugLoc()));
9484
9485 unsigned Idx = 0;
9486 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
9487 for (EVT VT : OrigResultTypes) {
9488 if (VT == MVT::Other)
9489 RetValues[Idx++] = Op.getOperand(0); // Chain
9490 else
9491 RetValues[Idx++] = DAG.getPOISON(VT);
9492 }
9493
9494 return DAG.getMergeValues(RetValues, DL);
9495 }
9496 }
9497 if (Opcode == -1 &&
9498 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9499 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
9500 NumVDataDwords, NumVAddrDwords);
9501 if (Opcode == -1)
9502 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
9503 NumVDataDwords, NumVAddrDwords);
9504 }
9505 if (Opcode == -1)
9506 return Op;
9507
9508 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
9509 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
9510 MachineMemOperand *MemRef = MemOp->getMemOperand();
9511 DAG.setNodeMemRefs(NewNode, {MemRef});
9512 }
9513
9514 if (BaseOpcode->AtomicX2) {
9515 SmallVector<SDValue, 1> Elt;
9516 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
9517 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
9518 }
9519 if (BaseOpcode->NoReturn)
9520 return SDValue(NewNode, 0);
9521 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
9522 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9523 NumVDataDwords, IsAtomicPacked16Bit, DL);
9524}
9525
9526SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
9527 SDValue Offset, SDValue CachePolicy,
9528 SelectionDAG &DAG) const {
9529 MachineFunction &MF = DAG.getMachineFunction();
9530
9531 const DataLayout &DataLayout = DAG.getDataLayout();
9532 Align Alignment =
9533 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
9534
9535 MachineMemOperand *MMO = MF.getMachineMemOperand(
9536 MachinePointerInfo(),
9537 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
9538 MachineMemOperand::MOInvariant,
9539 VT.getStoreSize(), Alignment);
9540
9541 if (!Offset->isDivergent()) {
9542 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
9543
9544 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
9545 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
9546 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
9547 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
9548 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9549 SDValue BufferLoad =
9550 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
9551 DAG.getVTList(MVT::i32), Ops, VT, MMO);
9552 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
9553 }
9554
9555 // Widen vec3 load to vec4.
9556 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
9557 !Subtarget->hasScalarDwordx3Loads()) {
9558 EVT WidenedVT =
9559 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
9560 auto WidenedOp = DAG.getMemIntrinsicNode(
9561 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
9562 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
9563 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
9564 DAG.getVectorIdxConstant(0, DL));
9565 return Subvector;
9566 }
9567
9568 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
9569 DAG.getVTList(VT), Ops, VT, MMO);
9570 }
9571
9572 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
9573 // assume that the buffer is unswizzled.
9574 SDValue Ops[] = {
9575 DAG.getEntryNode(), // Chain
9576 Rsrc, // rsrc
9577 DAG.getConstant(0, DL, MVT::i32), // vindex
9578 {}, // voffset
9579 {}, // soffset
9580 {}, // offset
9581 CachePolicy, // cachepolicy
9582 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9583 };
9584 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9585 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
9586 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
9587 }
9588
9589 SmallVector<SDValue, 4> Loads;
9590 unsigned NumLoads = 1;
9591 MVT LoadVT = VT.getSimpleVT();
9592 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
9593 assert((LoadVT.getScalarType() == MVT::i32 ||
9594 LoadVT.getScalarType() == MVT::f32));
9595
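 // For example, a <8 x i32> result is split into two <4 x i32> buffer loads at
 // offsets 16 bytes apart and concatenated back together below.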
9596 if (NumElts == 8 || NumElts == 16) {
9597 NumLoads = NumElts / 4;
9598 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
9599 }
9600
9601 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
9602
9603 // Use the alignment to ensure that the required offsets will fit into the
9604 // immediate offsets.
9605 setBufferOffsets(Offset, DAG, &Ops[3],
9606 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
9607
9608 uint64_t InstOffset = Ops[5]->getAsZExtVal();
9609 for (unsigned i = 0; i < NumLoads; ++i) {
9610 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
9611 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
9612 LoadVT, MMO, DAG));
9613 }
9614
9615 if (NumElts == 8 || NumElts == 16)
9616 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
9617
9618 return Loads[0];
9619}
9620
9621SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
9622 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
9623 if (!Subtarget->hasArchitectedSGPRs())
9624 return {};
9625 SDLoc SL(Op);
9626 MVT VT = MVT::i32;
9627 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
9628 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9629 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
9630}
9631
9632SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
9633 AMDGPU::Hwreg::Id HwReg,
9634 unsigned LowBit,
9635 unsigned Width) const {
9636 SDLoc SL(Op);
9637 using namespace AMDGPU::Hwreg;
9638 return {DAG.getMachineNode(
9639 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9640 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
9641 SL, MVT::i32)),
9642 0};
9643}
9644
9645SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
9646 unsigned Dim,
9647 const ArgDescriptor &Arg) const {
9648 SDLoc SL(Op);
9649 MachineFunction &MF = DAG.getMachineFunction();
9650 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
9651 if (MaxID == 0)
9652 return DAG.getConstant(0, SL, MVT::i32);
9653
9654 // It's undefined behavior if a function marked with the amdgpu-no-*
9655 // attributes uses the corresponding intrinsic.
9656 if (!Arg)
9657 return DAG.getPOISON(Op->getValueType(0));
9658
9659 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
9660 SDLoc(DAG.getEntryNode()), Arg);
9661
9662 // Don't bother inserting AssertZext for packed IDs since we're emitting the
9663 // masking operations anyway.
9664 //
9665 // TODO: We could assert the top bit is 0 for the source copy.
9666 if (Arg.isMasked())
9667 return Val;
9668
9669 // Preserve the known bits after expansion to a copy.
9670 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
9671 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
9672 DAG.getValueType(SmallVT));
9673}
9674
9675SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
9676 SelectionDAG &DAG) const {
9677 MachineFunction &MF = DAG.getMachineFunction();
9678 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
9679
9680 EVT VT = Op.getValueType();
9681 SDLoc DL(Op);
9682 unsigned IntrinsicID = Op.getConstantOperandVal(0);
9683
9684 // TODO: Should this propagate fast-math-flags?
9685
9686 switch (IntrinsicID) {
9687 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9688 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
9689 return emitNonHSAIntrinsicError(DAG, DL, VT);
9690 return getPreloadedValue(DAG, *MFI, VT,
9691 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
9692 }
9693 case Intrinsic::amdgcn_dispatch_ptr:
9694 case Intrinsic::amdgcn_queue_ptr: {
9695 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
9696 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9697 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
9698 DL.getDebugLoc()));
9699 return DAG.getPOISON(VT);
9700 }
9701
9702 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9703 ? AMDGPUFunctionArgInfo::DISPATCH_PTR
9704 : AMDGPUFunctionArgInfo::QUEUE_PTR;
9705 return getPreloadedValue(DAG, *MFI, VT, RegID);
9706 }
9707 case Intrinsic::amdgcn_implicitarg_ptr: {
9708 if (MFI->isEntryFunction())
9709 return getImplicitArgPtr(DAG, DL);
9710 return getPreloadedValue(DAG, *MFI, VT,
9711 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
9712 }
9713 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9714 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
9715 // This only makes sense to call in a kernel, so just lower to null.
9716 return DAG.getConstant(0, DL, VT);
9717 }
9718
9719 return getPreloadedValue(DAG, *MFI, VT,
9720 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
9721 }
9722 case Intrinsic::amdgcn_dispatch_id: {
9723 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
9724 }
9725 case Intrinsic::amdgcn_rcp:
9726 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
9727 case Intrinsic::amdgcn_rsq:
9728 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9729 case Intrinsic::amdgcn_rsq_legacy:
9730 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9731 return emitRemovedIntrinsicError(DAG, DL, VT);
9732 return SDValue();
9733 case Intrinsic::amdgcn_rcp_legacy:
9734 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
9735 return emitRemovedIntrinsicError(DAG, DL, VT);
9736 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
9737 case Intrinsic::amdgcn_rsq_clamp: {
9738 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9739 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
9740
9741 Type *Type = VT.getTypeForEVT(*DAG.getContext());
9742 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
9743 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
9744
9745 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
9746 SDValue Tmp =
9747 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
9748 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
9749 DAG.getConstantFP(Min, DL, VT));
9750 }
9751 case Intrinsic::r600_read_ngroups_x:
9752 if (Subtarget->isAmdHsaOS())
9753 return emitNonHSAIntrinsicError(DAG, DL, VT);
9754
9755 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9756 SI::KernelInputOffsets::NGROUPS_X, Align(4),
9757 false);
9758 case Intrinsic::r600_read_ngroups_y:
9759 if (Subtarget->isAmdHsaOS())
9760 return emitNonHSAIntrinsicError(DAG, DL, VT);
9761
9762 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9763 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
9764 false);
9765 case Intrinsic::r600_read_ngroups_z:
9766 if (Subtarget->isAmdHsaOS())
9767 return emitNonHSAIntrinsicError(DAG, DL, VT);
9768
9769 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
9770 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
9771 false);
9772 case Intrinsic::r600_read_local_size_x:
9773 if (Subtarget->isAmdHsaOS())
9774 return emitNonHSAIntrinsicError(DAG, DL, VT);
9775
9776 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9777 SI::KernelInputOffsets::LOCAL_SIZE_X);
9778 case Intrinsic::r600_read_local_size_y:
9779 if (Subtarget->isAmdHsaOS())
9780 return emitNonHSAIntrinsicError(DAG, DL, VT);
9781
9782 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9783 SI::KernelInputOffsets::LOCAL_SIZE_Y);
9784 case Intrinsic::r600_read_local_size_z:
9785 if (Subtarget->isAmdHsaOS())
9786 return emitNonHSAIntrinsicError(DAG, DL, VT);
9787
9788 return lowerImplicitZextParam(DAG, Op, MVT::i16,
9789 SI::KernelInputOffsets::LOCAL_SIZE_Z);
9790 case Intrinsic::amdgcn_workgroup_id_x:
9791 return lowerWorkGroupId(DAG, *MFI, VT,
9795 case Intrinsic::amdgcn_workgroup_id_y:
9796 return lowerWorkGroupId(DAG, *MFI, VT,
9800 case Intrinsic::amdgcn_workgroup_id_z:
9801 return lowerWorkGroupId(DAG, *MFI, VT,
9805 case Intrinsic::amdgcn_cluster_id_x:
9806 return Subtarget->hasClusters()
9807 ? getPreloadedValue(DAG, *MFI, VT,
9809 : DAG.getPOISON(VT);
9810 case Intrinsic::amdgcn_cluster_id_y:
9811 return Subtarget->hasClusters()
9812 ? getPreloadedValue(DAG, *MFI, VT,
9814 : DAG.getPOISON(VT);
9815 case Intrinsic::amdgcn_cluster_id_z:
9816 return Subtarget->hasClusters()
9817 ? getPreloadedValue(DAG, *MFI, VT,
9819 : DAG.getPOISON(VT);
9820 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9821 return Subtarget->hasClusters()
9822 ? getPreloadedValue(
9823 DAG, *MFI, VT,
9825 : DAG.getPOISON(VT);
9826 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9827 return Subtarget->hasClusters()
9828 ? getPreloadedValue(
9829 DAG, *MFI, VT,
9831 : DAG.getPOISON(VT);
9832 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9833 return Subtarget->hasClusters()
9834 ? getPreloadedValue(
9835 DAG, *MFI, VT,
9837 : DAG.getPOISON(VT);
9838 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9839 return Subtarget->hasClusters()
9840 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
9841 : SDValue();
9842 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9843 return Subtarget->hasClusters()
9844 ? getPreloadedValue(
9845 DAG, *MFI, VT,
9847 : DAG.getPOISON(VT);
9848 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9849 return Subtarget->hasClusters()
9850 ? getPreloadedValue(
9851 DAG, *MFI, VT,
9853 : DAG.getPOISON(VT);
9854 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9855 return Subtarget->hasClusters()
9856 ? getPreloadedValue(
9857 DAG, *MFI, VT,
9859 : DAG.getPOISON(VT);
9860 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9861 return Subtarget->hasClusters()
9862 ? getPreloadedValue(
9863 DAG, *MFI, VT,
9865 : DAG.getPOISON(VT);
9866 case Intrinsic::amdgcn_wave_id:
9867 return lowerWaveID(DAG, Op);
9868 case Intrinsic::amdgcn_lds_kernel_id: {
9869 if (MFI->isEntryFunction())
9870 return getLDSKernelId(DAG, DL);
9871 return getPreloadedValue(DAG, *MFI, VT,
9872 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
9873 }
9874 case Intrinsic::amdgcn_workitem_id_x:
9875 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
9876 case Intrinsic::amdgcn_workitem_id_y:
9877 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
9878 case Intrinsic::amdgcn_workitem_id_z:
9879 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
9880 case Intrinsic::amdgcn_wavefrontsize:
9881 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
9882 SDLoc(Op), MVT::i32);
9883 case Intrinsic::amdgcn_s_buffer_load: {
9884 unsigned CPol = Op.getConstantOperandVal(3);
9885 // s_buffer_load, because of how it's optimized, can't be volatile
9886 // so reject ones with the volatile bit set.
9887 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
9888 ? AMDGPU::CPol::ALL
9889 : AMDGPU::CPol::ALL_pregfx12))
9890 return Op;
9891 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
9892 Op.getOperand(3), DAG);
9893 }
9894 case Intrinsic::amdgcn_fdiv_fast:
9895 return lowerFDIV_FAST(Op, DAG);
9896 case Intrinsic::amdgcn_sin:
9897 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
9898
9899 case Intrinsic::amdgcn_cos:
9900 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
9901
9902 case Intrinsic::amdgcn_mul_u24:
9903 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
9904 Op.getOperand(2));
9905 case Intrinsic::amdgcn_mul_i24:
9906 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
9907 Op.getOperand(2));
9908
9909 case Intrinsic::amdgcn_log_clamp: {
9910 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
9911 return SDValue();
9912
9913 return emitRemovedIntrinsicError(DAG, DL, VT);
9914 }
9915 case Intrinsic::amdgcn_fract:
9916 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
9917
9918 case Intrinsic::amdgcn_class:
9919 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
9920 Op.getOperand(2));
9921 case Intrinsic::amdgcn_div_fmas:
9922 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
9923 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9924
9925 case Intrinsic::amdgcn_div_fixup:
9926 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
9927 Op.getOperand(2), Op.getOperand(3));
9928
9929 case Intrinsic::amdgcn_div_scale: {
9930 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
9931
9932 // Translate to the operands expected by the machine instruction. The
9933 // first parameter must be the same as the first instruction.
9934 SDValue Numerator = Op.getOperand(1);
9935 SDValue Denominator = Op.getOperand(2);
9936
9937 // Note this order is the opposite of the machine instruction's operands,
9938 // which are s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
9939 // intrinsic has the numerator as the first operand to match a normal
9940 // division operation.
9941
9942 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
9943
9944 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
9945 Denominator, Numerator);
9946 }
9947 case Intrinsic::amdgcn_icmp: {
9948 // There is a Pat that handles this variant, so return it as-is.
9949 if (Op.getOperand(1).getValueType() == MVT::i1 &&
9950 Op.getConstantOperandVal(2) == 0 &&
9951 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
9952 return Op;
9953 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
9954 }
9955 case Intrinsic::amdgcn_fcmp: {
9956 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
9957 }
9958 case Intrinsic::amdgcn_ballot:
9959 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
9960 case Intrinsic::amdgcn_fmed3:
9961 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
9962 Op.getOperand(2), Op.getOperand(3));
9963 case Intrinsic::amdgcn_fdot2:
9964 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
9965 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
9966 case Intrinsic::amdgcn_fmul_legacy:
9967 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
9968 Op.getOperand(2));
9969 case Intrinsic::amdgcn_sffbh:
9970 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
9971 case Intrinsic::amdgcn_sbfe:
9972 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
9973 Op.getOperand(2), Op.getOperand(3));
9974 case Intrinsic::amdgcn_ubfe:
9975 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
9976 Op.getOperand(2), Op.getOperand(3));
9977 case Intrinsic::amdgcn_cvt_pkrtz:
9978 case Intrinsic::amdgcn_cvt_pknorm_i16:
9979 case Intrinsic::amdgcn_cvt_pknorm_u16:
9980 case Intrinsic::amdgcn_cvt_pk_i16:
9981 case Intrinsic::amdgcn_cvt_pk_u16: {
9982 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
9983 EVT VT = Op.getValueType();
9984 unsigned Opcode;
9985
9986 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
9987 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
9988 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
9989 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
9990 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
9991 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
9992 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
9993 Opcode = AMDGPUISD::CVT_PK_I16_I32;
9994 else
9995 Opcode = AMDGPUISD::CVT_PK_U16_U32;
9996
9997 if (isTypeLegal(VT))
9998 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
9999
10000 SDValue Node =
10001 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
10002 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
10003 }
10004 case Intrinsic::amdgcn_fmad_ftz:
10005 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
10006 Op.getOperand(2), Op.getOperand(3));
10007
10008 case Intrinsic::amdgcn_if_break:
10009 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
10010 Op->getOperand(1), Op->getOperand(2)),
10011 0);
10012
10013 case Intrinsic::amdgcn_groupstaticsize: {
10014 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
10015 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
10016 return Op;
10017
10018 const Module *M = MF.getFunction().getParent();
10019 const GlobalValue *GV =
10020 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
10021 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
10022 SIInstrInfo::MO_ABS32_LO);
10023 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10024 }
10025 case Intrinsic::amdgcn_is_shared:
10026 case Intrinsic::amdgcn_is_private: {
10027 SDLoc SL(Op);
10028 SDValue SrcVec =
10029 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10030 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
10031 DAG.getConstant(1, SL, MVT::i32));
10032
10033 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10034 ? AMDGPUAS::LOCAL_ADDRESS
10035 : AMDGPUAS::PRIVATE_ADDRESS;
10036 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
10037 Subtarget->hasGloballyAddressableScratch()) {
10038 SDValue FlatScratchBaseHi(
10039 DAG.getMachineNode(
10040 AMDGPU::S_MOV_B32, DL, MVT::i32,
10041 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10042 0);
10043 // Test bits 63..58 against the aperture address.
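// XOR clears the bits that match the flat scratch base, so the result is
// below 1 << 26 exactly when pointer bits 63..58 equal the base's bits.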
10044 return DAG.getSetCC(
10045 SL, MVT::i1,
10046 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
10047 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
10048 }
10049
10050 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10051 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
10052 }
10053 case Intrinsic::amdgcn_perm:
10054 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
10055 Op.getOperand(2), Op.getOperand(3));
10056 case Intrinsic::amdgcn_reloc_constant: {
10057 Module *M = MF.getFunction().getParent();
10058 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
10059 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
10060 auto *RelocSymbol = cast<GlobalVariable>(
10061 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
10062 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
10063 SIInstrInfo::MO_ABS32_LO);
10064 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
10065 }
10066 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10067 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10068 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10069 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10070 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10071 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10072 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10073 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
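// If the index key is already i32 there is nothing to do here; otherwise
// canonicalize it to i32 and rebuild the intrinsic so the selection
// patterns see a legal index-key type.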
10074 if (Op.getOperand(4).getValueType() == MVT::i32)
10075 return SDValue();
10076
10077 SDLoc SL(Op);
10078 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
10079 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10080 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10081 Op.getOperand(3), IndexKeyi32);
10082 }
10083 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10084 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10085 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10086 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10087 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10088 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10089 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10090 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10091 if (Op.getOperand(4).getValueType() == MVT::i64)
10092 return SDValue();
10093
10094 SDLoc SL(Op);
10095 auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
10096 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10097 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10098 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10099 Op.getOperand(6)});
10100 }
10101 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10102 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10103 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10104 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10105 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10106 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10107 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10108 ? MVT::i64
10109 : MVT::i32;
10110 if (Op.getOperand(6).getValueType() == IndexKeyTy)
10111 return SDValue();
10112
10113 SDLoc SL(Op);
10114 auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
10115 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10116 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10117 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10118 IndexKey, Op.getOperand(7),
10119 Op.getOperand(8)}); // No clamp operand
10120 }
10121 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10122 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10123 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10124 if (Op.getOperand(6).getValueType() == MVT::i32)
10125 return SDValue();
10126
10127 SDLoc SL(Op);
10128 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
10129 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
10130 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10131 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10132 IndexKeyi32, Op.getOperand(7)});
10133 }
10134 case Intrinsic::amdgcn_addrspacecast_nonnull:
10135 return lowerADDRSPACECAST(Op, DAG);
10136 case Intrinsic::amdgcn_readlane:
10137 case Intrinsic::amdgcn_readfirstlane:
10138 case Intrinsic::amdgcn_writelane:
10139 case Intrinsic::amdgcn_permlane16:
10140 case Intrinsic::amdgcn_permlanex16:
10141 case Intrinsic::amdgcn_permlane64:
10142 case Intrinsic::amdgcn_set_inactive:
10143 case Intrinsic::amdgcn_set_inactive_chain_arg:
10144 case Intrinsic::amdgcn_mov_dpp8:
10145 case Intrinsic::amdgcn_update_dpp:
10146 return lowerLaneOp(*this, Op.getNode(), DAG);
10147 case Intrinsic::amdgcn_dead: {
10149 for (const EVT ValTy : Op.getNode()->values())
10150 Poisons.push_back(DAG.getPOISON(ValTy));
10151 return DAG.getMergeValues(Poisons, SDLoc(Op));
10152 }
10153 default:
10154 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10155 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10156 return lowerImage(Op, ImageDimIntr, DAG, false);
10157
10158 return Op;
10159 }
10160}
10161
10162 // On targets that do not support a constant in the soffset field, turn a
10163 // zero soffset into SGPR_NULL to avoid generating an extra s_mov of zero.
10164 static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
10165 const GCNSubtarget *Subtarget) {
10166 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
10167 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10168 return SOffset;
10169}
10170
10171SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
10172 SelectionDAG &DAG,
10173 unsigned NewOpcode) const {
10174 SDLoc DL(Op);
10175
10176 SDValue VData = Op.getOperand(2);
10177 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10178 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10179 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10180 SDValue Ops[] = {
10181 Op.getOperand(0), // Chain
10182 VData, // vdata
10183 Rsrc, // rsrc
10184 DAG.getConstant(0, DL, MVT::i32), // vindex
10185 VOffset, // voffset
10186 SOffset, // soffset
10187 Offset, // offset
10188 Op.getOperand(6), // cachepolicy
10189 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10190 };
10191
10192 auto *M = cast<MemSDNode>(Op);
10193
10194 EVT MemVT = VData.getValueType();
10195 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10196 M->getMemOperand());
10197}
10198
10199SDValue
10200SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
10201 unsigned NewOpcode) const {
10202 SDLoc DL(Op);
10203
10204 SDValue VData = Op.getOperand(2);
10205 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10206 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10207 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10208 SDValue Ops[] = {
10209 Op.getOperand(0), // Chain
10210 VData, // vdata
10211 Rsrc, // rsrc
10212 Op.getOperand(4), // vindex
10213 VOffset, // voffset
10214 SOffset, // soffset
10215 Offset, // offset
10216 Op.getOperand(7), // cachepolicy
10217 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10218 };
10219
10220 auto *M = cast<MemSDNode>(Op);
10221
10222 EVT MemVT = VData.getValueType();
10223 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
10224 M->getMemOperand());
10225}
10226
10227SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
10228 SelectionDAG &DAG) const {
10229 unsigned IntrID = Op.getConstantOperandVal(1);
10230 SDLoc DL(Op);
10231
10232 switch (IntrID) {
10233 case Intrinsic::amdgcn_ds_ordered_add:
10234 case Intrinsic::amdgcn_ds_ordered_swap: {
10235 MemSDNode *M = cast<MemSDNode>(Op);
10236 SDValue Chain = M->getOperand(0);
10237 SDValue M0 = M->getOperand(2);
10238 SDValue Value = M->getOperand(3);
10239 unsigned IndexOperand = M->getConstantOperandVal(7);
10240 unsigned WaveRelease = M->getConstantOperandVal(8);
10241 unsigned WaveDone = M->getConstantOperandVal(9);
10242
10243 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10244 IndexOperand &= ~0x3f;
10245 unsigned CountDw = 0;
10246
10247 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
10248 CountDw = (IndexOperand >> 24) & 0xf;
10249 IndexOperand &= ~(0xf << 24);
10250
10251 if (CountDw < 1 || CountDw > 4) {
10252 const Function &Fn = DAG.getMachineFunction().getFunction();
10253 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10254 Fn, "ds_ordered_count: dword count must be between 1 and 4",
10255 DL.getDebugLoc()));
10256 CountDw = 1;
10257 }
10258 }
10259
10260 if (IndexOperand) {
10261 const Function &Fn = DAG.getMachineFunction().getFunction();
10262 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10263 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
10264 }
10265
10266 if (WaveDone && !WaveRelease) {
10267 // TODO: Move this to IR verifier
10268 const Function &Fn = DAG.getMachineFunction().getFunction();
10269 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10270 Fn, "ds_ordered_count: wave_done requires wave_release",
10271 DL.getDebugLoc()));
10272 }
10273
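// Pack the DS_ORDERED_COUNT offset field: offset0 carries the byte offset
// of the ordered-count slot, offset1 carries wave_release/wave_done, the
// instruction kind, the dword count (gfx10+) and the shader type
// (pre-gfx11).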
10274 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10275 unsigned ShaderType =
10276 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
10277 unsigned Offset0 = OrderedCountIndex << 2;
10278 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10279
10280 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
10281 Offset1 |= (CountDw - 1) << 6;
10282
10283 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
10284 Offset1 |= ShaderType << 2;
10285
10286 unsigned Offset = Offset0 | (Offset1 << 8);
10287
10288 SDValue Ops[] = {
10289 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
10290 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
10291 };
10292 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
10293 M->getVTList(), Ops, M->getMemoryVT(),
10294 M->getMemOperand());
10295 }
10296 case Intrinsic::amdgcn_raw_buffer_load:
10297 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10298 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10299 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10300 case Intrinsic::amdgcn_raw_buffer_load_format:
10301 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10302 const bool IsFormat =
10303 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10304 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10305
10306 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10307 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10308 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10309 SDValue Ops[] = {
10310 Op.getOperand(0), // Chain
10311 Rsrc, // rsrc
10312 DAG.getConstant(0, DL, MVT::i32), // vindex
10313 VOffset, // voffset
10314 SOffset, // soffset
10315 Offset, // offset
10316 Op.getOperand(5), // cachepolicy, swizzled buffer
10317 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10318 };
10319
10320 auto *M = cast<MemSDNode>(Op);
10321 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
10322 }
10323 case Intrinsic::amdgcn_struct_buffer_load:
10324 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10325 case Intrinsic::amdgcn_struct_buffer_load_format:
10326 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10327 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10328 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10329 const bool IsFormat =
10330 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10331 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10332
10333 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10334 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10335 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10336 SDValue Ops[] = {
10337 Op.getOperand(0), // Chain
10338 Rsrc, // rsrc
10339 Op.getOperand(3), // vindex
10340 VOffset, // voffset
10341 SOffset, // soffset
10342 Offset, // offset
10343 Op.getOperand(6), // cachepolicy, swizzled buffer
10344 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10345 };
10346
10347 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
10348 }
10349 case Intrinsic::amdgcn_raw_tbuffer_load:
10350 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10351 MemSDNode *M = cast<MemSDNode>(Op);
10352 EVT LoadVT = Op.getValueType();
10353 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10354 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
10355 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
10356
10357 SDValue Ops[] = {
10358 Op.getOperand(0), // Chain
10359 Rsrc, // rsrc
10360 DAG.getConstant(0, DL, MVT::i32), // vindex
10361 VOffset, // voffset
10362 SOffset, // soffset
10363 Offset, // offset
10364 Op.getOperand(5), // format
10365 Op.getOperand(6), // cachepolicy, swizzled buffer
10366 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10367 };
10368
10369 if (LoadVT.getScalarType() == MVT::f16)
10370 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10371 Ops);
10372 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10373 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10374 DAG);
10375 }
10376 case Intrinsic::amdgcn_struct_tbuffer_load:
10377 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10378 MemSDNode *M = cast<MemSDNode>(Op);
10379 EVT LoadVT = Op.getValueType();
10380 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
10381 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
10382 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
10383
10384 SDValue Ops[] = {
10385 Op.getOperand(0), // Chain
10386 Rsrc, // rsrc
10387 Op.getOperand(3), // vindex
10388 VOffset, // voffset
10389 SOffset, // soffset
10390 Offset, // offset
10391 Op.getOperand(6), // format
10392 Op.getOperand(7), // cachepolicy, swizzled buffer
10393 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10394 };
10395
10396 if (LoadVT.getScalarType() == MVT::f16)
10397 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10398 Ops);
10399 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
10400 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
10401 DAG);
10402 }
10403 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10404 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10405 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10406 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10407 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10408 return lowerStructBufferAtomicIntrin(Op, DAG,
10409 AMDGPUISD::BUFFER_ATOMIC_FADD);
10410 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10411 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10412 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10413 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10414 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10415 return lowerStructBufferAtomicIntrin(Op, DAG,
10416 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10417 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10418 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10419 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10420 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10421 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10422 return lowerStructBufferAtomicIntrin(Op, DAG,
10423 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10424 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10425 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10426 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10427 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10428 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10429 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10430 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10431 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10432 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10433 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10434 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10435 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10436 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10437 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10438 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10439 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10440 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10441 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10442 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10444 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10445 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10446 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10447 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10448 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10449 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10450 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10451 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10452 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10453 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10454 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10455 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10456 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10457 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10458 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10459 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10460 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10461 return lowerRawBufferAtomicIntrin(Op, DAG,
10462 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10463 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10464 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10465 return lowerStructBufferAtomicIntrin(Op, DAG,
10466 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10467 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10468 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10469 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10470 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10471 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10472 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10473 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10474 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10475 return lowerStructBufferAtomicIntrin(Op, DAG,
10476 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10477 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10478 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10479 return lowerStructBufferAtomicIntrin(Op, DAG,
10480 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10481 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10482 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10483 return lowerStructBufferAtomicIntrin(Op, DAG,
10484 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10485 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10487 return lowerStructBufferAtomicIntrin(Op, DAG,
10488 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10489 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10491 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10492 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10493 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10494 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10495 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10497 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10498 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10499 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10500 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10501 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10502 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10503 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10504 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10505 return lowerStructBufferAtomicIntrin(Op, DAG,
10506 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10507
10508 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10509 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10510 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10511 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10512 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10513 SDValue Ops[] = {
10514 Op.getOperand(0), // Chain
10515 Op.getOperand(2), // src
10516 Op.getOperand(3), // cmp
10517 Rsrc, // rsrc
10518 DAG.getConstant(0, DL, MVT::i32), // vindex
10519 VOffset, // voffset
10520 SOffset, // soffset
10521 Offset, // offset
10522 Op.getOperand(7), // cachepolicy
10523 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10524 };
10525 EVT VT = Op.getValueType();
10526 auto *M = cast<MemSDNode>(Op);
10527
10528 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10529 Op->getVTList(), Ops, VT,
10530 M->getMemOperand());
10531 }
10532 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10533 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10534 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10535 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10536 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
10537 SDValue Ops[] = {
10538 Op.getOperand(0), // Chain
10539 Op.getOperand(2), // src
10540 Op.getOperand(3), // cmp
10541 Rsrc, // rsrc
10542 Op.getOperand(5), // vindex
10543 VOffset, // voffset
10544 SOffset, // soffset
10545 Offset, // offset
10546 Op.getOperand(8), // cachepolicy
10547 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10548 };
10549 EVT VT = Op.getValueType();
10550 auto *M = cast<MemSDNode>(Op);
10551
10552 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
10553 Op->getVTList(), Ops, VT,
10554 M->getMemOperand());
10555 }
10556 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10557 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10558 MemSDNode *M = cast<MemSDNode>(Op);
10559 SDValue NodePtr = M->getOperand(2);
10560 SDValue RayExtent = M->getOperand(3);
10561 SDValue InstanceMask = M->getOperand(4);
10562 SDValue RayOrigin = M->getOperand(5);
10563 SDValue RayDir = M->getOperand(6);
10564 SDValue Offsets = M->getOperand(7);
10565 SDValue TDescr = M->getOperand(8);
10566
10567 assert(NodePtr.getValueType() == MVT::i64);
10568 assert(RayDir.getValueType() == MVT::v3f32);
10569
10570 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10571 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10572 return SDValue();
10573 }
10574
10575 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10576 const unsigned NumVDataDwords = 10;
10577 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10578 int Opcode = AMDGPU::getMIMGOpcode(
10579 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10580 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10581 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
10582 assert(Opcode != -1);
10583
10585 Ops.push_back(NodePtr);
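// Pack the f32 ray extent and the 8-bit instance mask into a single
// v2i32 vaddr operand.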
10586 Ops.push_back(DAG.getBuildVector(
10587 MVT::v2i32, DL,
10588 {DAG.getBitcast(MVT::i32, RayExtent),
10589 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10590 Ops.push_back(RayOrigin);
10591 Ops.push_back(RayDir);
10592 Ops.push_back(Offsets);
10593 Ops.push_back(TDescr);
10594 Ops.push_back(M->getChain());
10595
10596 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10597 MachineMemOperand *MemRef = M->getMemOperand();
10598 DAG.setNodeMemRefs(NewNode, {MemRef});
10599 return SDValue(NewNode, 0);
10600 }
10601 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10602 MemSDNode *M = cast<MemSDNode>(Op);
10603 SDValue NodePtr = M->getOperand(2);
10604 SDValue RayExtent = M->getOperand(3);
10605 SDValue RayOrigin = M->getOperand(4);
10606 SDValue RayDir = M->getOperand(5);
10607 SDValue RayInvDir = M->getOperand(6);
10608 SDValue TDescr = M->getOperand(7);
10609
10610 assert(NodePtr.getValueType() == MVT::i32 ||
10611 NodePtr.getValueType() == MVT::i64);
10612 assert(RayDir.getValueType() == MVT::v3f16 ||
10613 RayDir.getValueType() == MVT::v3f32);
10614
10615 if (!Subtarget->hasGFX10_AEncoding()) {
10616 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
10617 return SDValue();
10618 }
10619
10620 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
10621 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10622 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10623 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
10624 const bool Is64 = NodePtr.getValueType() == MVT::i64;
10625 const unsigned NumVDataDwords = 4;
10626 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
10627 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
10628 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
10629 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
10630 IsGFX12Plus;
10631 const unsigned BaseOpcodes[2][2] = {
10632 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
10633 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
10634 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
10635 int Opcode;
10636 if (UseNSA) {
10637 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10638 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
10639 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
10640 : AMDGPU::MIMGEncGfx10NSA,
10641 NumVDataDwords, NumVAddrDwords);
10642 } else {
10643 assert(!IsGFX12Plus);
10644 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
10645 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
10646 : AMDGPU::MIMGEncGfx10Default,
10647 NumVDataDwords, NumVAddrDwords);
10648 }
10649 assert(Opcode != -1);
10650
10652
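// packLanes flattens a 3-element vector into i32 vaddr dwords: f32 lanes
// take one dword each, while f16 lanes are packed in pairs, straddling the
// previous dword when the component is not dword-aligned.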
10653 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
10654 SmallVector<SDValue, 3> Lanes;
10655 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
10656 if (Lanes[0].getValueSizeInBits() == 32) {
10657 for (unsigned I = 0; I < 3; ++I)
10658 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
10659 } else {
10660 if (IsAligned) {
10661 Ops.push_back(DAG.getBitcast(
10662 MVT::i32,
10663 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
10664 Ops.push_back(Lanes[2]);
10665 } else {
10666 SDValue Elt0 = Ops.pop_back_val();
10667 Ops.push_back(DAG.getBitcast(
10668 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
10669 Ops.push_back(DAG.getBitcast(
10670 MVT::i32,
10671 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
10672 }
10673 }
10674 };
10675
10676 if (UseNSA && IsGFX11Plus) {
10677 Ops.push_back(NodePtr);
10678 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10679 Ops.push_back(RayOrigin);
10680 if (IsA16) {
10681 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
10682 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
10683 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
10684 for (unsigned I = 0; I < 3; ++I) {
10685 MergedLanes.push_back(DAG.getBitcast(
10686 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
10687 {DirLanes[I], InvDirLanes[I]})));
10688 }
10689 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
10690 } else {
10691 Ops.push_back(RayDir);
10692 Ops.push_back(RayInvDir);
10693 }
10694 } else {
10695 if (Is64)
10696 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
10697 2);
10698 else
10699 Ops.push_back(NodePtr);
10700
10701 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
10702 packLanes(RayOrigin, true);
10703 packLanes(RayDir, true);
10704 packLanes(RayInvDir, false);
10705 }
10706
10707 if (!UseNSA) {
10708 // Build a single vector containing all the operands so far prepared.
10709 if (NumVAddrDwords > 12) {
10710 SDValue Undef = DAG.getPOISON(MVT::i32);
10711 Ops.append(16 - Ops.size(), Undef);
10712 }
10713 assert(Ops.size() >= 8 && Ops.size() <= 12);
10714 SDValue MergedOps =
10715 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
10716 Ops.clear();
10717 Ops.push_back(MergedOps);
10718 }
10719
10720 Ops.push_back(TDescr);
10721 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
10722 Ops.push_back(M->getChain());
10723
10724 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
10725 MachineMemOperand *MemRef = M->getMemOperand();
10726 DAG.setNodeMemRefs(NewNode, {MemRef});
10727 return SDValue(NewNode, 0);
10728 }
10729 case Intrinsic::amdgcn_global_atomic_fmin_num:
10730 case Intrinsic::amdgcn_global_atomic_fmax_num:
10731 case Intrinsic::amdgcn_flat_atomic_fmin_num:
10732 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10733 MemSDNode *M = cast<MemSDNode>(Op);
10734 SDValue Ops[] = {
10735 M->getOperand(0), // Chain
10736 M->getOperand(2), // Ptr
10737 M->getOperand(3) // Value
10738 };
10739 unsigned Opcode = 0;
10740 switch (IntrID) {
10741 case Intrinsic::amdgcn_global_atomic_fmin_num:
10742 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
10743 Opcode = ISD::ATOMIC_LOAD_FMIN;
10744 break;
10745 }
10746 case Intrinsic::amdgcn_global_atomic_fmax_num:
10747 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
10748 Opcode = ISD::ATOMIC_LOAD_FMAX;
10749 break;
10750 }
10751 default:
10752 llvm_unreachable("unhandled atomic opcode");
10753 }
10754 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
10755 Ops, M->getMemOperand());
10756 }
10757 case Intrinsic::amdgcn_s_get_barrier_state:
10758 case Intrinsic::amdgcn_s_get_named_barrier_state: {
10759 SDValue Chain = Op->getOperand(0);
10761 unsigned Opc;
10762
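// For named barriers the barrier ID sits in bits 4..9 of the operand:
// decode it at compile time for immediates, otherwise shift and mask it
// into M0.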
10763 if (isa<ConstantSDNode>(Op->getOperand(2))) {
10764 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
10765 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
10766 BarID = (BarID >> 4) & 0x3F;
10767 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
10768 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
10769 Ops.push_back(K);
10770 Ops.push_back(Chain);
10771 } else {
10772 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
10773 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
10774 SDValue M0Val;
10775 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
10776 DAG.getShiftAmountConstant(4, MVT::i32, DL));
10777 M0Val = SDValue(
10778 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
10779 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
10780 0);
10781 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10782 } else
10783 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
10784 }
10785
10786 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10787 return SDValue(NewMI, 0);
10788 }
10789 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
10790 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
10791 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
10792 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
10793 SDValue Chain = Op->getOperand(0);
10794 SDValue Ptr = Op->getOperand(2);
10795 EVT VT = Op->getValueType(0);
10796 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
10797 Chain, Ptr, MII->getMemOperand());
10798 }
10799 default:
10800
10801 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10802 AMDGPU::getImageDimIntrinsicInfo(IntrID))
10803 return lowerImage(Op, ImageDimIntr, DAG, true);
10804
10805 return SDValue();
10806 }
10807}
10808
10809// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
10810// dwordx4 if on SI and handle TFE loads.
10811SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
10812 SDVTList VTList,
10813 ArrayRef<SDValue> Ops, EVT MemVT,
10814 MachineMemOperand *MMO,
10815 SelectionDAG &DAG) const {
10816 LLVMContext &C = *DAG.getContext();
10817 MachineFunction &MF = DAG.getMachineFunction();
10818 EVT VT = VTList.VTs[0];
10819
10820 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
10821 bool IsTFE = VTList.NumVTs == 3;
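// With TFE the instruction returns an extra status dword, so lower the
// load as an all-i32 vector one dword wider than the value and then split
// the value and status back apart.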
10822 if (IsTFE) {
10823 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
10824 unsigned NumOpDWords = NumValueDWords + 1;
10825 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
10826 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
10827 MachineMemOperand *OpDWordsMMO =
10828 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
10829 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
10830 OpDWordsVT, OpDWordsMMO, DAG);
10831 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
10832 DAG.getVectorIdxConstant(NumValueDWords, DL));
10833 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
10834 SDValue ValueDWords =
10835 NumValueDWords == 1
10836 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
10837 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
10838 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
10839 ZeroIdx);
10840 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
10841 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
10842 }
10843
10844 if (!Subtarget->hasDwordx3LoadStores() &&
10845 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
10846 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
10847 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
10848 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
10849 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
10850 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
10851 WidenedMemVT, WidenedMMO);
10852 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
10853 DAG.getVectorIdxConstant(0, DL));
10854 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
10855 }
10856
10857 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
10858}
10859
10860SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
10861 bool ImageStore) const {
10862 EVT StoreVT = VData.getValueType();
10863
10864 // No change for f16 and legal vector D16 types.
10865 if (!StoreVT.isVector())
10866 return VData;
10867
10868 SDLoc DL(VData);
10869 unsigned NumElements = StoreVT.getVectorNumElements();
10870
10871 if (Subtarget->hasUnpackedD16VMem()) {
10872 // We need to unpack the packed data to store.
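// Each 16-bit element is zero-extended into its own 32-bit dword.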
10873 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10874 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10875
10876 EVT EquivStoreVT =
10877 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
10878 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
10879 return DAG.UnrollVectorOp(ZExt.getNode());
10880 }
10881
10882 // The sq block of gfx8.1 does not estimate register use correctly for d16
10883 // image store instructions. The data operand is computed as if it were not a
10884 // d16 image instruction.
10885 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
10886 // Bitcast to i16
10887 EVT IntStoreVT = StoreVT.changeTypeToInteger();
10888 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10889
10890 // Decompose into scalars
10891 SmallVector<SDValue, 4> Elts;
10892 DAG.ExtractVectorElements(IntVData, Elts);
10893
10894 // Group pairs of i16 into v2i16 and bitcast to i32
10895 SmallVector<SDValue, 4> PackedElts;
10896 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
10897 SDValue Pair =
10898 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
10899 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10900 PackedElts.push_back(IntPair);
10901 }
10902 if ((NumElements % 2) == 1) {
10903 // Handle v3i16
10904 unsigned I = Elts.size() / 2;
10905 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
10906 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
10907 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
10908 PackedElts.push_back(IntPair);
10909 }
10910
10911 // Pad with poison values.
10912 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
10913
10914 // Build final vector
10915 EVT VecVT =
10916 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
10917 return DAG.getBuildVector(VecVT, DL, PackedElts);
10918 }
10919
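// Widen odd-sized D16 vectors (e.g. v3f16) by one element so the packed
// data occupies whole dwords.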
10920 if (NumElements == 3) {
10921 EVT IntStoreVT =
10922 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
10923 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
10924
10925 EVT WidenedStoreVT = EVT::getVectorVT(
10926 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
10927 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
10928 WidenedStoreVT.getStoreSizeInBits());
10929 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
10930 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
10931 }
10932
10933 assert(isTypeLegal(StoreVT));
10934 return VData;
10935}
10936
10937SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
10938 SelectionDAG &DAG) const {
10939 SDLoc DL(Op);
10940 SDValue Chain = Op.getOperand(0);
10941 unsigned IntrinsicID = Op.getConstantOperandVal(1);
10942 MachineFunction &MF = DAG.getMachineFunction();
10943
10944 switch (IntrinsicID) {
10945 case Intrinsic::amdgcn_exp_compr: {
10946 if (!Subtarget->hasCompressedExport()) {
10947 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10949 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10950 }
10951 SDValue Src0 = Op.getOperand(4);
10952 SDValue Src1 = Op.getOperand(5);
10953 // Hack around illegal type on SI by directly selecting it.
10954 if (isTypeLegal(Src0.getValueType()))
10955 return SDValue();
10956
10957 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
10958 SDValue Undef = DAG.getPOISON(MVT::f32);
10959 const SDValue Ops[] = {
10960 Op.getOperand(2), // tgt
10961 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
10962 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
10963 Undef, // src2
10964 Undef, // src3
10965 Op.getOperand(7), // vm
10966 DAG.getTargetConstant(1, DL, MVT::i1), // compr
10967 Op.getOperand(3), // en
10968 Op.getOperand(0) // Chain
10969 };
10970
10971 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
10972 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
10973 }
10974
10975 case Intrinsic::amdgcn_struct_tbuffer_store:
10976 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
10977 SDValue VData = Op.getOperand(2);
10978 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
10979 if (IsD16)
10980 VData = handleD16VData(VData, DAG);
10981 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
10982 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10983 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
10984 SDValue Ops[] = {
10985 Chain,
10986 VData, // vdata
10987 Rsrc, // rsrc
10988 Op.getOperand(4), // vindex
10989 VOffset, // voffset
10990 SOffset, // soffset
10991 Offset, // offset
10992 Op.getOperand(7), // format
10993 Op.getOperand(8), // cachepolicy, swizzled buffer
10994 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
10995 };
10996 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
10997 : AMDGPUISD::TBUFFER_STORE_FORMAT;
10998 MemSDNode *M = cast<MemSDNode>(Op);
10999 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11000 M->getMemoryVT(), M->getMemOperand());
11001 }
11002
11003 case Intrinsic::amdgcn_raw_tbuffer_store:
11004 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11005 SDValue VData = Op.getOperand(2);
11006 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11007 if (IsD16)
11008 VData = handleD16VData(VData, DAG);
11009 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11010 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11011 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11012 SDValue Ops[] = {
11013 Chain,
11014 VData, // vdata
11015 Rsrc, // rsrc
11016 DAG.getConstant(0, DL, MVT::i32), // vindex
11017 VOffset, // voffset
11018 SOffset, // soffset
11019 Offset, // offset
11020 Op.getOperand(6), // format
11021 Op.getOperand(7), // cachepolicy, swizzled buffer
11022 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11023 };
11024 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11025 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11026 MemSDNode *M = cast<MemSDNode>(Op);
11027 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11028 M->getMemoryVT(), M->getMemOperand());
11029 }
11030
11031 case Intrinsic::amdgcn_raw_buffer_store:
11032 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11033 case Intrinsic::amdgcn_raw_buffer_store_format:
11034 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11035 const bool IsFormat =
11036 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11037 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11038
11039 SDValue VData = Op.getOperand(2);
11040 EVT VDataVT = VData.getValueType();
11041 EVT EltType = VDataVT.getScalarType();
11042 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11043 if (IsD16) {
11044 VData = handleD16VData(VData, DAG);
11045 VDataVT = VData.getValueType();
11046 }
11047
11048 if (!isTypeLegal(VDataVT)) {
11049 VData =
11050 DAG.getNode(ISD::BITCAST, DL,
11051 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11052 }
11053
11054 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11055 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11056 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11057 SDValue Ops[] = {
11058 Chain,
11059 VData,
11060 Rsrc,
11061 DAG.getConstant(0, DL, MVT::i32), // vindex
11062 VOffset, // voffset
11063 SOffset, // soffset
11064 Offset, // offset
11065 Op.getOperand(6), // cachepolicy, swizzled buffer
11066 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11067 };
11068 unsigned Opc =
11069 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11070 : AMDGPUISD::BUFFER_STORE;
11071 MemSDNode *M = cast<MemSDNode>(Op);
11072
11073 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11074 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11075 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11076
11077 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11078 M->getMemoryVT(), M->getMemOperand());
11079 }
11080
11081 case Intrinsic::amdgcn_struct_buffer_store:
11082 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11083 case Intrinsic::amdgcn_struct_buffer_store_format:
11084 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11085 const bool IsFormat =
11086 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11087 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11088
11089 SDValue VData = Op.getOperand(2);
11090 EVT VDataVT = VData.getValueType();
11091 EVT EltType = VDataVT.getScalarType();
11092 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
11093
11094 if (IsD16) {
11095 VData = handleD16VData(VData, DAG);
11096 VDataVT = VData.getValueType();
11097 }
11098
11099 if (!isTypeLegal(VDataVT)) {
11100 VData =
11101 DAG.getNode(ISD::BITCAST, DL,
11102 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
11103 }
11104
11105 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11106 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11107 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11108 SDValue Ops[] = {
11109 Chain,
11110 VData,
11111 Rsrc,
11112 Op.getOperand(4), // vindex
11113 VOffset, // voffset
11114 SOffset, // soffset
11115 Offset, // offset
11116 Op.getOperand(7), // cachepolicy, swizzled buffer
11117 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11118 };
11119 unsigned Opc =
11120 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT
11121 : AMDGPUISD::BUFFER_STORE;
11122 MemSDNode *M = cast<MemSDNode>(Op);
11123
11124 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
11125 EVT VDataType = VData.getValueType().getScalarType();
11126 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
11127 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11128
11129 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
11130 M->getMemoryVT(), M->getMemOperand());
11131 }
11132 case Intrinsic::amdgcn_raw_buffer_load_lds:
11133 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11134 case Intrinsic::amdgcn_struct_buffer_load_lds:
11135 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11136 if (!Subtarget->hasVMemToLDSLoad())
11137 return SDValue();
11138 unsigned Opc;
11139 bool HasVIndex =
11140 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11141 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11142 unsigned OpOffset = HasVIndex ? 1 : 0;
11143 SDValue VOffset = Op.getOperand(5 + OpOffset);
11144 bool HasVOffset = !isNullConstant(VOffset);
11145 unsigned Size = Op->getConstantOperandVal(4);
11146
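// Select the BUFFER_LOAD_*_LDS opcode from the load width and from
// whether vindex/voffset VGPR operands are present.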
11147 switch (Size) {
11148 default:
11149 return SDValue();
11150 case 1:
11151 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11152 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11153 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11154 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11155 break;
11156 case 2:
11157 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11158 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11159 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11160 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11161 break;
11162 case 4:
11163 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11164 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11165 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11166 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11167 break;
11168 case 12:
11169 if (!Subtarget->hasLDSLoadB96_B128())
11170 return SDValue();
11171 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11172 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11173 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11174 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11175 break;
11176 case 16:
11177 if (!Subtarget->hasLDSLoadB96_B128())
11178 return SDValue();
11179 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11180 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11181 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11182 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11183 break;
11184 }
11185
11186 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11187
11189
11190 if (HasVIndex && HasVOffset)
11191 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
11192 {Op.getOperand(5), // VIndex
11193 VOffset}));
11194 else if (HasVIndex)
11195 Ops.push_back(Op.getOperand(5));
11196 else if (HasVOffset)
11197 Ops.push_back(VOffset);
11198
11199 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11200 Ops.push_back(Rsrc);
11201 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
11202 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
11203 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11204 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11205 Ops.push_back(DAG.getTargetConstant(
11206 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
11207 DL, MVT::i8)); // cpol
11208 Ops.push_back(DAG.getTargetConstant(
11209 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
11210 ? 1
11211 : 0,
11212 DL, MVT::i8)); // swz
11213 Ops.push_back(M0Val.getValue(0)); // Chain
11214 Ops.push_back(M0Val.getValue(1)); // Glue
11215
11216 auto *M = cast<MemSDNode>(Op);
11217 MachineMemOperand *LoadMMO = M->getMemOperand();
11218 // Don't set the offset value here because the pointer points to the base of
11219 // the buffer.
11220 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11221
11222 MachinePointerInfo StorePtrI = LoadPtrI;
11223 LoadPtrI.V = PoisonValue::get(
11227
11228 auto F = LoadMMO->getFlags() &
11229 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
11230 LoadMMO =
11231 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
11232 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11233
11234 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11235 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
11236 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11237
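// The selected instruction both reads the buffer and writes LDS, so give
// it separate load and store memory operands.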
11238 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
11239 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11240
11241 return SDValue(Load, 0);
11242 }
11243 // Buffers are handled by LowerBufferFatPointers, and we're going to go
11244 // for "trust me" that the remaining cases are global pointers until
11245 // such time as we can put two mem operands on an intrinsic.
11246 case Intrinsic::amdgcn_load_to_lds:
11247 case Intrinsic::amdgcn_global_load_lds: {
11248 if (!Subtarget->hasVMemToLDSLoad())
11249 return SDValue();
11250
11251 unsigned Opc;
11252 unsigned Size = Op->getConstantOperandVal(4);
11253 switch (Size) {
11254 default:
11255 return SDValue();
11256 case 1:
11257 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11258 break;
11259 case 2:
11260 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11261 break;
11262 case 4:
11263 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11264 break;
11265 case 12:
11266 if (!Subtarget->hasLDSLoadB96_B128())
11267 return SDValue();
11268 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11269 break;
11270 case 16:
11271 if (!Subtarget->hasLDSLoadB96_B128())
11272 return SDValue();
11273 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11274 break;
11275 }
11276
11277 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
11278
11280
11281 SDValue Addr = Op.getOperand(2); // Global ptr
11282 SDValue VOffset;
11283 // Try to split SAddr and VOffset. Global and LDS pointers share the same
11284 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
11285 if (Addr->isDivergent() && Addr->isAnyAdd()) {
11286 SDValue LHS = Addr.getOperand(0);
11287 SDValue RHS = Addr.getOperand(1);
11288
11289 if (LHS->isDivergent())
11290 std::swap(LHS, RHS);
11291
11292 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
11293 RHS.getOperand(0).getValueType() == MVT::i32) {
11294 // add (i64 sgpr), (zero_extend (i32 vgpr))
11295 Addr = LHS;
11296 VOffset = RHS.getOperand(0);
11297 }
11298 }
11299
11300 Ops.push_back(Addr);
11301 if (!Addr->isDivergent()) {
11303 if (!VOffset)
11304 VOffset =
11305 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
11306 DAG.getTargetConstant(0, DL, MVT::i32)),
11307 0);
11308 Ops.push_back(VOffset);
11309 }
11310
11311 Ops.push_back(Op.getOperand(5)); // Offset
11312 Ops.push_back(Op.getOperand(6)); // CPol
11313 Ops.push_back(M0Val.getValue(0)); // Chain
11314 Ops.push_back(M0Val.getValue(1)); // Glue
11315
11316 auto *M = cast<MemSDNode>(Op);
11317 MachineMemOperand *LoadMMO = M->getMemOperand();
11318 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
11319 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11320 MachinePointerInfo StorePtrI = LoadPtrI;
11321 LoadPtrI.V = PoisonValue::get(
11325 auto F = LoadMMO->getFlags() &
11327 LoadMMO =
11329 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
11330 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
11331 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
11332 LoadMMO->getAAInfo());
11333
11334 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11335 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
11336
11337 return SDValue(Load, 0);
11338 }
11339 case Intrinsic::amdgcn_end_cf:
11340 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
11341 Op->getOperand(2), Chain),
11342 0);
11343 case Intrinsic::amdgcn_s_barrier_init:
11344 case Intrinsic::amdgcn_s_barrier_signal_var: {
11345 // These two intrinsics take two operands: the barrier pointer and member count.
11346 SDValue Chain = Op->getOperand(0);
11348 SDValue BarOp = Op->getOperand(2);
11349 SDValue CntOp = Op->getOperand(3);
11350 SDValue M0Val;
11351 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11352 ? AMDGPU::S_BARRIER_INIT_M0
11353 : AMDGPU::S_BARRIER_SIGNAL_M0;
11354 // extract the BarrierID from bits 4-9 of BarOp
11355 SDValue BarID;
11356 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11357 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11358 BarID =
11359 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
11360 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11361 0);
11362 // Member count should be put into M0[ShAmt+5:ShAmt]
11363 // Barrier ID should be put into M0[5:0]
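// Illustrative example with hypothetical values: for BarOp = 0x1A0 the
// barrier ID is (0x1A0 >> 4) & 0x3F = 0x1A, and with ShAmt == 16 and a
// member count of 8 the value written to M0 is (8 << 16) | 0x1A = 0x8001A.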
11364 M0Val =
11365 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
11366 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11367 0);
11368 constexpr unsigned ShAmt = 16;
11369 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
11370 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
11371
11372 M0Val = SDValue(
11373 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
11374
11375 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11376
11377 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11378 return SDValue(NewMI, 0);
11379 }
11380 case Intrinsic::amdgcn_s_barrier_join: {
11382 // This intrinsic takes a single operand: the barrier pointer.
11382 SDValue Chain = Op->getOperand(0);
11384 SDValue BarOp = Op->getOperand(2);
11385 unsigned Opc;
11386
11387 if (isa<ConstantSDNode>(BarOp)) {
11388 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
11389 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11390
11391 // extract the BarrierID from bits 4-9 of the immediate
11392 unsigned BarID = (BarVal >> 4) & 0x3F;
11393 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11394 Ops.push_back(K);
11395 Ops.push_back(Chain);
11396 } else {
11397 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11398
11399 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
11400 SDValue M0Val;
11401 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
11402 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11403 M0Val =
11404 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11405 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11406 0);
11407 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11408 }
11409
11410 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11411 return SDValue(NewMI, 0);
11412 }
11413 case Intrinsic::amdgcn_s_prefetch_data: {
11414 // For a non-global address space, preserve the chain and remove the call.
11416 return Op.getOperand(0);
11417 return Op;
11418 }
11419 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11420 SDValue Ops[] = {
11421 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11422 Op.getOperand(3), // offset
11423 Op.getOperand(4), // length
11424 };
11425
11426 MemSDNode *M = cast<MemSDNode>(Op);
11428 Op->getVTList(), Ops, M->getMemoryVT(),
11429 M->getMemOperand());
11430 }
11431 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11432 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11433 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11434 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11435 SDValue Chain = Op->getOperand(0);
11436 SDValue Ptr = Op->getOperand(2);
11437 SDValue Val = Op->getOperand(3);
11438 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
11439 Ptr, MII->getMemOperand());
11440 }
11441 default: {
11442 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11444 return lowerImage(Op, ImageDimIntr, DAG, true);
11445
11446 return Op;
11447 }
11448 }
11449}
11450
11451 // Return whether the operation has the NoUnsignedWrap property.
11452static bool isNoUnsignedWrap(SDValue Addr) {
11453 return (Addr.getOpcode() == ISD::ADD &&
11454 Addr->getFlags().hasNoUnsignedWrap()) ||
11455 Addr->getOpcode() == ISD::OR;
11456}
11457
11459 EVT PtrVT) const {
11460 return UseSelectionDAGPTRADD && PtrVT == MVT::i64;
11461}
11462
11464 EVT PtrVT) const {
11465 return true;
11466}
11467
11468// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
11469// offset (the offset that is included in bounds checking and swizzling, to be
11470// split between the instruction's voffset and immoffset fields) and soffset
11471// (the offset that is excluded from bounds checking and swizzling, to go in
11472// the instruction's soffset field). This function takes the first kind of
11473// offset and figures out how to split it between voffset and immoffset.
11474std::pair<SDValue, SDValue>
11475SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
11476 SDLoc DL(Offset);
11477 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
11478 SDValue N0 = Offset;
11479 ConstantSDNode *C1 = nullptr;
11480
11481 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
11482 N0 = SDValue();
11483 else if (DAG.isBaseWithConstantOffset(N0)) {
11484 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
11485 // being added, so we can only safely match a 32-bit addition with no
11486 // unsigned overflow.
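// Illustrative example of why nuw is required (hypothetical values): a 32-bit
// add of 0xFFFFFF10 + 0xFFC wraps to 0xF0C, but if the two parts are split
// into voffset/immoffset and each is zero-extended before the hardware add,
// the sum is 0x1_00000F0C, a different address. Splitting is therefore only
// safe when the 32-bit add is known not to wrap.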
11487 bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
11488 if (!CheckNUW || isNoUnsignedWrap(N0)) {
11489 C1 = cast<ConstantSDNode>(N0.getOperand(1));
11490 N0 = N0.getOperand(0);
11491 }
11492 }
11493
11494 if (C1) {
11495 unsigned ImmOffset = C1->getZExtValue();
11496 // If the immediate value is too big for the immoffset field, put only the
11497 // bits that would normally fit in the immoffset field. The remaining value
11498 // that is copied/added for the voffset field is a multiple of a large power
11499 // of 2, and it stands a better chance of being CSE'd with the copy/add for
11500 // another similar load/store.
11501 // However, do not do that rounding down if the remaining (overflow) part is
11502 // negative, as it appears to be illegal to have a negative offset in the
11503 // VGPR, even if adding the immediate offset would make it positive.
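// Worked example, assuming MaxImm == 4095 (0xFFF) as on many subtargets:
//   ImmOffset = 0x11234 -> Overflow = 0x11234 & ~0xFFF = 0x11000 (to voffset),
//                          ImmOffset = 0x11234 - 0x11000 = 0x234 (imm field).
//   ImmOffset = 0x80000123 -> Overflow = 0x80000000 is negative as int32, so
//                             the whole 0x80000123 goes to voffset and the
//                             imm field is set to 0.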
11504 unsigned Overflow = ImmOffset & ~MaxImm;
11505 ImmOffset -= Overflow;
11506 if ((int32_t)Overflow < 0) {
11507 Overflow += ImmOffset;
11508 ImmOffset = 0;
11509 }
11510 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
11511 if (Overflow) {
11512 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11513 if (!N0)
11514 N0 = OverflowVal;
11515 else {
11516 SDValue Ops[] = {N0, OverflowVal};
11517 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
11518 }
11519 }
11520 }
11521 if (!N0)
11522 N0 = DAG.getConstant(0, DL, MVT::i32);
11523 if (!C1)
11524 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
11525 return {N0, SDValue(C1, 0)};
11526}
11527
11528// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
11529// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
11530// pointed to by Offsets.
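// Illustrative outcomes of the cases below: a constant combined offset that
// does not fit the hardware immediate field is split so that the part that
// fits goes into instoffset and the remainder into soffset, with voffset left
// at 0; a (base + constant) offset keeps the base in voffset instead; anything
// else is passed through unchanged in voffset.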
11531void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11532 SelectionDAG &DAG, SDValue *Offsets,
11533 Align Alignment) const {
11534 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11535 SDLoc DL(CombinedOffset);
11536 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
11537 uint32_t Imm = C->getZExtValue();
11538 uint32_t SOffset, ImmOffset;
11539 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11540 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
11541 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11542 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11543 return;
11544 }
11545 }
11546 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
11547 SDValue N0 = CombinedOffset.getOperand(0);
11548 SDValue N1 = CombinedOffset.getOperand(1);
11549 uint32_t SOffset, ImmOffset;
11550 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
11551 if (Offset >= 0 &&
11552 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11553 Offsets[0] = N0;
11554 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
11555 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
11556 return;
11557 }
11558 }
11559
11560 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11561 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
11562 : DAG.getConstant(0, DL, MVT::i32);
11563
11564 Offsets[0] = CombinedOffset;
11565 Offsets[1] = SOffsetZero;
11566 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
11567}
11568
11569SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11570 SelectionDAG &DAG) const {
11571 if (!MaybePointer.getValueType().isScalarInteger())
11572 return MaybePointer;
11573
11574 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
11575 return Rsrc;
11576}
11577
11578 // Wrap a global or flat pointer into a buffer resource descriptor using the
11579 // flags specified in the intrinsic.
11580SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
11581 SelectionDAG &DAG) const {
11582 SDLoc Loc(Op);
11583
11584 SDValue Pointer = Op->getOperand(1);
11585 SDValue Stride = Op->getOperand(2);
11586 SDValue NumRecords = Op->getOperand(3);
11587 SDValue Flags = Op->getOperand(4);
11588
11589 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
11590 SDValue Rsrc;
11591
11592 if (Subtarget->has45BitNumRecordsBufferResource()) {
11593 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
11594 // Build the lower 64-bit value, which holds the 57-bit base address and the
11595 // low 7 bits of num_records.
11596 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
11597 SDValue NumRecordsLHS =
11598 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
11599 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
11600 SDValue LowHalf =
11601 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
11602
11603 // Build the higher 64-bit value, which holds the upper 38 bits of num_records,
11604 // 6 zero bits (omitted), the 16-bit stride/scale field and the 4-bit flags.
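// Resulting 128-bit descriptor layout, as assembled by the shifts below:
//   bits [56:0]    base address (57 bits)
//   bits [63:57]   num_records[6:0]
//   bits [101:64]  num_records[44:7]
//   bits [107:102] zero
//   bits [123:108] stride / scale (16 bits)
//   bits [127:124] flags (4 bits)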
11605 SDValue NumRecordsRHS =
11606 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
11607 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
11608 SDValue ShiftedStride =
11609 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11610 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
11611 SDValue ExtShiftedStrideVec =
11612 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
11613 SDValue ExtShiftedStride =
11614 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
11615 SDValue ShiftedFlags =
11616 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
11617 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
11618 SDValue ExtShiftedFlagsVec =
11619 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
11620 SDValue ExtShiftedFlags =
11621 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
11622 SDValue CombinedFields =
11623 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11624 SDValue HighHalf =
11625 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11626
11627 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
11628 } else {
11629 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
11630 auto [LowHalf, HighHalf] =
11631 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11632 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
11633 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
11634 SDValue ShiftedStride =
11635 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
11636 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
11637 SDValue NewHighHalf =
11638 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
11639
11640 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
11641 NumRecords, Flags);
11642 }
11643
11644 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
11645 return RsrcPtr;
11646}
11647
11648 // Handle 8-bit and 16-bit buffer loads.
11649SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
11650 EVT LoadVT, SDLoc DL,
11652 MachineMemOperand *MMO,
11653 bool IsTFE) const {
11654 EVT IntVT = LoadVT.changeTypeToInteger();
11655
11656 if (IsTFE) {
11657 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
11660 MachineFunction &MF = DAG.getMachineFunction();
11661 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
11662 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
11663 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
11664 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11665 DAG.getConstant(1, DL, MVT::i32));
11666 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11667 DAG.getConstant(0, DL, MVT::i32));
11668 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
11669 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
11670 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11671 }
11672
11673 unsigned Opc = LoadVT.getScalarType() == MVT::i8
11676
11677 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
11678 SDValue BufferLoad =
11679 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
11680 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
11681 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
11682
11683 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
11684}
11685
11686 // Handle 8-bit and 16-bit buffer stores.
11687SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
11688 EVT VDataType, SDLoc DL,
11689 SDValue Ops[],
11690 MemSDNode *M) const {
11691 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11692 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
11693
11694 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
11695 Ops[1] = BufferStoreExt;
11696 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11697 : AMDGPUISD::BUFFER_STORE_SHORT;
11698 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
11699 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
11700 M->getMemOperand());
11701}
11702
11704 SDValue Op, const SDLoc &SL, EVT VT) {
11705 if (VT.bitsLT(Op.getValueType()))
11706 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
11707
11708 switch (ExtType) {
11709 case ISD::SEXTLOAD:
11710 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
11711 case ISD::ZEXTLOAD:
11712 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
11713 case ISD::EXTLOAD:
11714 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
11715 case ISD::NON_EXTLOAD:
11716 return Op;
11717 }
11718
11719 llvm_unreachable("invalid ext type");
11720}
11721
11722 // Try to turn 8- and 16-bit scalar loads into SMEM-eligible 32-bit loads.
11723 // TODO: Skip this on GFX12, which does have scalar sub-dword loads.
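// Sketch of the transform (illustrative, hypothetical operands): a uniform,
// 4-byte-aligned
//   (i16 (load addrspace(4) %p, align 4))
// is rewritten as
//   (i16 (trunc (i32 (load addrspace(4) %p, align 4))))
// so the load itself becomes 32 bits wide and can be selected to a scalar
// (SMEM) load such as s_load_dword.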
11724SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
11725 DAGCombinerInfo &DCI) const {
11726 SelectionDAG &DAG = DCI.DAG;
11727 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
11728 return SDValue();
11729
11730 // FIXME: Constant loads should all be marked invariant.
11731 unsigned AS = Ld->getAddressSpace();
11732 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
11734 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
11735 return SDValue();
11736
11737 // Don't do this early, since it may interfere with adjacent load merging for
11738 // illegal types. We can avoid losing alignment information for exotic types
11739 // pre-legalize.
11740 EVT MemVT = Ld->getMemoryVT();
11741 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
11742 MemVT.getSizeInBits() >= 32)
11743 return SDValue();
11744
11745 SDLoc SL(Ld);
11746
11747 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
11748 "unexpected vector extload");
11749
11750 // TODO: Drop only high part of range.
11751 SDValue Ptr = Ld->getBasePtr();
11752 SDValue NewLoad = DAG.getLoad(
11753 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
11754 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
11755 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
11756 nullptr); // Drop ranges
11757
11758 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
11759 if (MemVT.isFloatingPoint()) {
11761 "unexpected fp extload");
11762 TruncVT = MemVT.changeTypeToInteger();
11763 }
11764
11765 SDValue Cvt = NewLoad;
11766 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
11767 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
11768 DAG.getValueType(TruncVT));
11769 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
11771 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
11772 } else {
11774 }
11775
11776 EVT VT = Ld->getValueType(0);
11777 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
11778
11779 DCI.AddToWorklist(Cvt.getNode());
11780
11781 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
11782 // the appropriate extension from the 32-bit load.
11783 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
11784 DCI.AddToWorklist(Cvt.getNode());
11785
11786 // Handle conversion back to floating point if necessary.
11787 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
11788
11789 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
11790}
11791
11793 const SIMachineFunctionInfo &Info) {
11794 // TODO: Should check if the address can definitely not access stack.
11795 if (Info.isEntryFunction())
11796 return Info.getUserSGPRInfo().hasFlatScratchInit();
11797 return true;
11798}
11799
11800SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
11801 SDLoc DL(Op);
11802 LoadSDNode *Load = cast<LoadSDNode>(Op);
11803 ISD::LoadExtType ExtType = Load->getExtensionType();
11804 EVT MemVT = Load->getMemoryVT();
11805 MachineMemOperand *MMO = Load->getMemOperand();
11806
11807 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
11808 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
11809 return SDValue();
11810
11811 // FIXME: Copied from PPC
11812 // First, load into 32 bits, then truncate to 1 bit.
11813
11814 SDValue Chain = Load->getChain();
11815 SDValue BasePtr = Load->getBasePtr();
11816
11817 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
11818
11819 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
11820 RealMemVT, MMO);
11821
11822 if (!MemVT.isVector()) {
11823 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
11824 NewLD.getValue(1)};
11825
11826 return DAG.getMergeValues(Ops, DL);
11827 }
11828
11830 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
11831 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
11832 DAG.getConstant(I, DL, MVT::i32));
11833
11834 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
11835 }
11836
11837 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
11838
11839 return DAG.getMergeValues(Ops, DL);
11840 }
11841
11842 if (!MemVT.isVector())
11843 return SDValue();
11844
11845 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
11846 "Custom lowering for non-i32 vectors hasn't been implemented.");
11847
11848 Align Alignment = Load->getAlign();
11849 unsigned AS = Load->getAddressSpace();
11850 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
11851 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
11852 return SplitVectorLoad(Op, DAG);
11853 }
11854
11855 MachineFunction &MF = DAG.getMachineFunction();
11856 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
11857 // If there is a possibility that a flat instruction may access scratch
11858 // memory, then we need to use the same legalization rules we use for private.
11859 if (AS == AMDGPUAS::FLAT_ADDRESS &&
11860 !Subtarget->hasMultiDwordFlatScratchAddressing())
11861 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
11864
11865 unsigned NumElements = MemVT.getVectorNumElements();
11866
11867 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11869 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
11870 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
11872 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
11873 Alignment >= Align(4) && NumElements < 32) {
11874 if (MemVT.isPow2VectorType() ||
11875 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
11876 return SDValue();
11877 return WidenOrSplitVectorLoad(Op, DAG);
11878 }
11879 // Non-uniform loads will be selected to MUBUF instructions, so they
11880 // have the same legalization requirements as global and private
11881 // loads.
11882 //
11883 }
11884 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
11887 if (NumElements > 4)
11888 return SplitVectorLoad(Op, DAG);
11889 // v3 loads not supported on SI.
11890 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11891 return WidenOrSplitVectorLoad(Op, DAG);
11892
11893 // v3 and v4 loads are supported for private and global memory.
11894 return SDValue();
11895 }
11896 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
11897 // Depending on the setting of the private_element_size field in the
11898 // resource descriptor, we can only make private accesses up to a certain
11899 // size.
11900 switch (Subtarget->getMaxPrivateElementSize()) {
11901 case 4: {
11902 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
11903 return DAG.getMergeValues({Op0, Op1}, DL);
11904 }
11905 case 8:
11906 if (NumElements > 2)
11907 return SplitVectorLoad(Op, DAG);
11908 return SDValue();
11909 case 16:
11910 // Same as global/flat
11911 if (NumElements > 4)
11912 return SplitVectorLoad(Op, DAG);
11913 // v3 loads not supported on SI.
11914 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
11915 return WidenOrSplitVectorLoad(Op, DAG);
11916
11917 return SDValue();
11918 default:
11919 llvm_unreachable("unsupported private_element_size");
11920 }
11921 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
11922 unsigned Fast = 0;
11923 auto Flags = Load->getMemOperand()->getFlags();
11925 Load->getAlign(), Flags, &Fast) &&
11926 Fast > 1)
11927 return SDValue();
11928
11929 if (MemVT.isVector())
11930 return SplitVectorLoad(Op, DAG);
11931 }
11932
11934 MemVT, *Load->getMemOperand())) {
11935 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
11936 return DAG.getMergeValues({Op0, Op1}, DL);
11937 }
11938
11939 return SDValue();
11940}
11941
11942SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
11943 EVT VT = Op.getValueType();
11944 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
11945 VT.getSizeInBits() == 512)
11946 return splitTernaryVectorOp(Op, DAG);
11947
11948 assert(VT.getSizeInBits() == 64);
11949
11950 SDLoc DL(Op);
11951 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
11952
11953 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
11954 SDValue One = DAG.getConstant(1, DL, MVT::i32);
11955
11956 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11957 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
11958
11959 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
11960 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
11961
11962 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
11963
11964 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
11965 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
11966
11967 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
11968
11969 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
11970 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
11971}
11972
11973// Catch division cases where we can use shortcuts with rcp and rsq
11974// instructions.
11975SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
11976 SelectionDAG &DAG) const {
11977 SDLoc SL(Op);
11978 SDValue LHS = Op.getOperand(0);
11979 SDValue RHS = Op.getOperand(1);
11980 EVT VT = Op.getValueType();
11981 const SDNodeFlags Flags = Op->getFlags();
11982
11983 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
11984
11985 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
11986 // Without !fpmath accuracy information, we can't do more because we don't
11987 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
11988 // f16 is always accurate enough.
11989 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
11990 return SDValue();
11991
11992 if (CLHS->isExactlyValue(1.0)) {
11993 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
11994 // the CI documentation have a worst-case error of 1 ulp.
11995 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
11996 // use it as long as we aren't trying to use denormals.
11997 //
11998 // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
11999
12000 // 1.0 / sqrt(x) -> rsq(x)
12001
12002 // XXX - Is afn sufficient to do this for f64? The maximum ULP
12003 // error seems really high at 2^29 ULP.
12004 // 1.0 / x -> rcp(x)
12005 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12006 }
12007
12008 // Same as for 1.0, but expand the sign out of the constant.
12009 if (CLHS->isExactlyValue(-1.0)) {
12010 // -1.0 / x -> rcp (fneg x)
12011 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12012 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12013 }
12014 }
12015
12016 // For f16 and bf16 require afn or arcp.
12017 // For f32 require afn.
12018 if (!AllowInaccurateRcp &&
12019 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12020 return SDValue();
12021
12022 // Turn into multiply by the reciprocal.
12023 // x / y -> x * (1.0 / y)
12024 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12025 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
12026}
12027
12028SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
12029 SelectionDAG &DAG) const {
12030 SDLoc SL(Op);
12031 SDValue X = Op.getOperand(0);
12032 SDValue Y = Op.getOperand(1);
12033 EVT VT = Op.getValueType();
12034 const SDNodeFlags Flags = Op->getFlags();
12035
12036 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12037 if (!AllowInaccurateDiv)
12038 return SDValue();
12039
12040 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
12041 SDValue One = DAG.getConstantFP(1.0, SL, VT);
12042
12043 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
12044 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12045
12046 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
12047 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
12048 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
12049 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
12050 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
12051 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
12052}
12053
12054static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12055 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
12056 SDNodeFlags Flags) {
12057 if (GlueChain->getNumValues() <= 1) {
12058 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12059 }
12060
12061 assert(GlueChain->getNumValues() == 3);
12062
12063 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12064 switch (Opcode) {
12065 default:
12066 llvm_unreachable("no chain equivalent for opcode");
12067 case ISD::FMUL:
12068 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12069 break;
12070 }
12071
12072 return DAG.getNode(Opcode, SL, VTList,
12073 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
12074 Flags);
12075}
12076
12077static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
12078 EVT VT, SDValue A, SDValue B, SDValue C,
12079 SDValue GlueChain, SDNodeFlags Flags) {
12080 if (GlueChain->getNumValues() <= 1) {
12081 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12082 }
12083
12084 assert(GlueChain->getNumValues() == 3);
12085
12086 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
12087 switch (Opcode) {
12088 default:
12089 llvm_unreachable("no chain equivalent for opcode");
12090 case ISD::FMA:
12091 Opcode = AMDGPUISD::FMA_W_CHAIN;
12092 break;
12093 }
12094
12095 return DAG.getNode(Opcode, SL, VTList,
12096 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
12097 Flags);
12098}
12099
12100SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
12101 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12102 return FastLowered;
12103
12104 SDLoc SL(Op);
12105 EVT VT = Op.getValueType();
12106 SDValue LHS = Op.getOperand(0);
12107 SDValue RHS = Op.getOperand(1);
12108
12109 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
12110 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
12111
12112 if (VT == MVT::bf16) {
12113 SDValue ExtDiv =
12114 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
12115 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
12116 DAG.getTargetConstant(0, SL, MVT::i32));
12117 }
12118
12119 assert(VT == MVT::f16);
12120
12121 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
12122 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
12123 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
12124 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
12125 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12126 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
12127 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
12128 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
12129 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
12130 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
12131 // q16.u = opx(V_CVT_F16_F32, q32.u);
12132 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
12133
12134 // We will use ISD::FMA on targets that don't support ISD::FMAD.
12135 unsigned FMADOpCode =
12137 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
12138 SDValue Rcp =
12139 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12140 SDValue Quot =
12141 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
12142 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12143 Op->getFlags());
12144 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12145 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12146 Op->getFlags());
12147 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
12148 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
12149 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
12150 DAG.getConstant(0xff800000, SL, MVT::i32));
12151 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
12152 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
12153 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
12154 DAG.getTargetConstant(0, SL, MVT::i32));
12155 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12156 Op->getFlags());
12157}
12158
12159// Faster 2.5 ULP division that does not support denormals.
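// Sketch of the scaling used below: rcp flushes denormal results, so when
// |RHS| > 2^96 the denominator is pre-scaled by 2^-32 and the same 2^-32
// factor is applied to the final product, using the identity
//   LHS / RHS == 2^-32 * (LHS * rcp(2^-32 * RHS)).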
12160SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
12161 SDNodeFlags Flags = Op->getFlags();
12162 SDLoc SL(Op);
12163 SDValue LHS = Op.getOperand(1);
12164 SDValue RHS = Op.getOperand(2);
12165
12166 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
12167
12168 const APFloat K0Val(0x1p+96f);
12169 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
12170
12171 const APFloat K1Val(0x1p-32f);
12172 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
12173
12174 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12175
12176 EVT SetCCVT =
12177 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
12178
12179 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
12180
12181 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
12182
12183 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
12184
12185 // rcp does not support denormals.
12186 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
12187
12188 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
12189
12190 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
12191}
12192
12193 // Returns the immediate value for setting the F32 denorm mode when using the
12194// S_DENORM_MODE instruction.
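// The returned immediate packs the new single-precision setting into bits
// [1:0] and the function's existing double/half-precision setting into bits
// [3:2], mirroring the Mode computation below.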
12197 const GCNSubtarget *ST) {
12198 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12199 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12200 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12201 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
12202}
12203
12204SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
12205 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12206 return FastLowered;
12207
12208 // The selection matcher assumes that anything with a chain selects to a
12209 // mayRaiseFPException machine instruction. Since we're introducing a chain
12210 // here, we need to explicitly report nofpexcept for the regular fdiv
12211 // lowering.
12212 SDNodeFlags Flags = Op->getFlags();
12213 Flags.setNoFPExcept(true);
12214
12215 SDLoc SL(Op);
12216 SDValue LHS = Op.getOperand(0);
12217 SDValue RHS = Op.getOperand(1);
12218
12219 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
12220
12221 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12222
12223 SDValue DenominatorScaled =
12224 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
12225 SDValue NumeratorScaled =
12226 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
12227
12228 // Denominator is scaled to not be denormal, so using rcp is ok.
12229 SDValue ApproxRcp =
12230 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12231 SDValue NegDivScale0 =
12232 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
12233
12234 using namespace AMDGPU::Hwreg;
12235 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12236 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
12237
12238 const MachineFunction &MF = DAG.getMachineFunction();
12239 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12240 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12241
12242 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
12243 const bool HasDynamicDenormals =
12244 (DenormMode.Input == DenormalMode::Dynamic) ||
12245 (DenormMode.Output == DenormalMode::Dynamic);
12246
12247 SDValue SavedDenormMode;
12248
12249 if (!PreservesDenormals) {
12250 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
12251 // lowering. The chain dependence is insufficient, and we need glue. We do
12252 // not need the glue variants in a strictfp function.
12253
12254 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12255
12256 SDValue Glue = DAG.getEntryNode();
12257 if (HasDynamicDenormals) {
12258 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
12259 DAG.getVTList(MVT::i32, MVT::Glue),
12260 {BitField, Glue});
12261 SavedDenormMode = SDValue(GetReg, 0);
12262
12263 Glue = DAG.getMergeValues(
12264 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
12265 }
12266
12267 SDNode *EnableDenorm;
12268 if (Subtarget->hasDenormModeInst()) {
12269 const SDValue EnableDenormValue =
12271
12272 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12273 EnableDenormValue)
12274 .getNode();
12275 } else {
12276 const SDValue EnableDenormValue =
12277 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
12278 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12279 {EnableDenormValue, BitField, Glue});
12280 }
12281
12282 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
12283 SDValue(EnableDenorm, 1)};
12284
12285 NegDivScale0 = DAG.getMergeValues(Ops, SL);
12286 }
12287
12288 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
12289 ApproxRcp, One, NegDivScale0, Flags);
12290
12291 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
12292 ApproxRcp, Fma0, Flags);
12293
12294 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
12295 Fma1, Flags);
12296
12297 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
12298 NumeratorScaled, Mul, Flags);
12299
12300 SDValue Fma3 =
12301 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
12302
12303 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
12304 NumeratorScaled, Fma3, Flags);
12305
12306 if (!PreservesDenormals) {
12307 SDNode *DisableDenorm;
12308 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12309 const SDValue DisableDenormValue = getSPDenormModeValue(
12310 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
12311
12312 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12313 DisableDenorm =
12314 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12315 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
12316 .getNode();
12317 } else {
12318 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12319 const SDValue DisableDenormValue =
12320 HasDynamicDenormals
12321 ? SavedDenormMode
12322 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
12323
12324 DisableDenorm = DAG.getMachineNode(
12325 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12326 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
12327 }
12328
12329 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
12330 SDValue(DisableDenorm, 0), DAG.getRoot());
12331 DAG.setRoot(OutputChain);
12332 }
12333
12334 SDValue Scale = NumeratorScaled.getValue(1);
12335 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
12336 {Fma4, Fma1, Fma3, Scale}, Flags);
12337
12338 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
12339}
12340
12341SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
12342 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12343 return FastLowered;
12344
12345 SDLoc SL(Op);
12346 SDValue X = Op.getOperand(0);
12347 SDValue Y = Op.getOperand(1);
12348
12349 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
12350
12351 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12352
12353 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
12354
12355 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
12356
12357 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12358
12359 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
12360
12361 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
12362
12363 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
12364
12365 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
12366
12367 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
12368 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
12369
12370 SDValue Fma4 =
12371 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
12372
12373 SDValue Scale;
12374
12375 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12376 // Work around a hardware bug on SI where the condition output from div_scale
12377 // is not usable.
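// Note on the workaround: div_fmas needs to know whether div_scale actually
// rescaled the numerator and/or the denominator. Since that condition output
// is unreliable here, it is recomputed below by comparing the high 32 bits
// (sign and exponent) of each operand with its div_scale result and XOR'ing
// the two comparisons.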
12378
12379 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
12380
12381 // Figure out the scale to use for div_fmas.
12382 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
12383 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
12384 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
12385 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
12386
12387 SDValue NumHi =
12388 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
12389 SDValue DenHi =
12390 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
12391
12392 SDValue Scale0Hi =
12393 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
12394 SDValue Scale1Hi =
12395 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
12396
12397 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
12398 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
12399 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
12400 } else {
12401 Scale = DivScale1.getValue(1);
12402 }
12403
12404 SDValue Fmas =
12405 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12406
12407 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12408}
12409
12410SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
12411 EVT VT = Op.getValueType();
12412
12413 if (VT == MVT::f32)
12414 return LowerFDIV32(Op, DAG);
12415
12416 if (VT == MVT::f64)
12417 return LowerFDIV64(Op, DAG);
12418
12419 if (VT == MVT::f16 || VT == MVT::bf16)
12420 return LowerFDIV16(Op, DAG);
12421
12422 llvm_unreachable("Unexpected type for fdiv");
12423}
12424
12425SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
12426 SDLoc dl(Op);
12427 SDValue Val = Op.getOperand(0);
12428 EVT VT = Val.getValueType();
12429 EVT ResultExpVT = Op->getValueType(1);
12430 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12431
12432 SDValue Mant = DAG.getNode(
12434 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
12435
12436 SDValue Exp = DAG.getNode(
12437 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
12438 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
12439
12440 if (Subtarget->hasFractBug()) {
12441 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
12442 SDValue Inf =
12444
12445 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
12446 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
12447 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
12448 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
12449 }
12450
12451 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
12452 return DAG.getMergeValues({Mant, CastExp}, dl);
12453}
12454
12455SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
12456 SDLoc DL(Op);
12457 StoreSDNode *Store = cast<StoreSDNode>(Op);
12458 EVT VT = Store->getMemoryVT();
12459
12460 if (VT == MVT::i1) {
12461 return DAG.getTruncStore(
12462 Store->getChain(), DL,
12463 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
12464 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12465 }
12466
12467 assert(VT.isVector() &&
12468 Store->getValue().getValueType().getScalarType() == MVT::i32);
12469
12470 unsigned AS = Store->getAddressSpace();
12471 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
12472 Store->getAlign().value() < VT.getStoreSize() &&
12473 VT.getSizeInBits() > 32) {
12474 return SplitVectorStore(Op, DAG);
12475 }
12476
12477 MachineFunction &MF = DAG.getMachineFunction();
12478 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12479 // If there is a possibility that a flat instruction may access scratch
12480 // memory, then we need to use the same legalization rules we use for private.
12481 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12482 !Subtarget->hasMultiDwordFlatScratchAddressing())
12483 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
12486
12487 unsigned NumElements = VT.getVectorNumElements();
12489 if (NumElements > 4)
12490 return SplitVectorStore(Op, DAG);
12491 // v3 stores not supported on SI.
12492 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12493 return SplitVectorStore(Op, DAG);
12494
12496 VT, *Store->getMemOperand()))
12497 return expandUnalignedStore(Store, DAG);
12498
12499 return SDValue();
12500 }
12501 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
12502 switch (Subtarget->getMaxPrivateElementSize()) {
12503 case 4:
12504 return scalarizeVectorStore(Store, DAG);
12505 case 8:
12506 if (NumElements > 2)
12507 return SplitVectorStore(Op, DAG);
12508 return SDValue();
12509 case 16:
12510 if (NumElements > 4 ||
12511 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12512 return SplitVectorStore(Op, DAG);
12513 return SDValue();
12514 default:
12515 llvm_unreachable("unsupported private_element_size");
12516 }
12517 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
12518 unsigned Fast = 0;
12519 auto Flags = Store->getMemOperand()->getFlags();
12521 Store->getAlign(), Flags, &Fast) &&
12522 Fast > 1)
12523 return SDValue();
12524
12525 if (VT.isVector())
12526 return SplitVectorStore(Op, DAG);
12527
12528 return expandUnalignedStore(Store, DAG);
12529 }
12530
12531 // Probably an invalid store. If so, we'll end up emitting a selection error.
12532 return SDValue();
12533}
12534
12535// Avoid the full correct expansion for f32 sqrt when promoting from f16.
12536SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
12537 SDLoc SL(Op);
12538 assert(!Subtarget->has16BitInsts());
12539 SDNodeFlags Flags = Op->getFlags();
12540 SDValue Ext =
12541 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
12542
12543 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
12544 SDValue Sqrt =
12545 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
12546
12547 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
12548 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
12549}
12550
12551SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
12552 SDLoc DL(Op);
12553 SDNodeFlags Flags = Op->getFlags();
12554 MVT VT = Op.getValueType().getSimpleVT();
12555 const SDValue X = Op.getOperand(0);
12556
12557 if (allowApproxFunc(DAG, Flags)) {
12558 // The instruction is accurate to 1 ulp but ignores denormals.
12559 return DAG.getNode(
12561 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
12562 }
12563
12564 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
12565 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
12566
12567 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
12568
12569 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
12570
12571 SDValue SqrtX =
12572 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
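// Scaling note: sqrt(x * 2^32) == 2^16 * sqrt(x), so inputs below 2^-96 are
// scaled up by 0x1.0p+32 here and the result is scaled back down by
// 0x1.0p-16 after the square root is computed.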
12573
12574 SDValue SqrtS;
12575 if (needsDenormHandlingF32(DAG, X, Flags)) {
12576 SDValue SqrtID =
12577 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
12578 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
12579
12580 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
12581 SDValue SqrtSNextDownInt =
12582 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12583 DAG.getAllOnesConstant(DL, MVT::i32));
12584 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
12585
12586 SDValue NegSqrtSNextDown =
12587 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
12588
12589 SDValue SqrtVP =
12590 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
12591
12592 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
12593 DAG.getConstant(1, DL, MVT::i32));
12594 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
12595
12596 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
12597 SDValue SqrtVS =
12598 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
12599
12600 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
12601 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
12602
12603 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
12604 Flags);
12605
12606 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
12607 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
12608 Flags);
12609 } else {
12610 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
12611
12612 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
12613
12614 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
12615 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
12616 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
12617
12618 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
12619 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
12620 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
12621
12622 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
12623 SDValue SqrtD =
12624 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
12625 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
12626 }
12627
12628 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
12629
12630 SDValue ScaledDown =
12631 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
12632
12633 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
12634 SDValue IsZeroOrInf =
12635 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12636 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12637
12638 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
12639}
12640
12641SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
12642 // For the double type, the SQRT and RSQ instructions don't have the required
12643 // precision, so we apply Goldschmidt's algorithm to improve the result:
12644 //
12645 // y0 = rsq(x)
12646 // g0 = x * y0
12647 // h0 = 0.5 * y0
12648 //
12649 // r0 = 0.5 - h0 * g0
12650 // g1 = g0 * r0 + g0
12651 // h1 = h0 * r0 + h0
12652 //
12653 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
12654 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
12655 // h2 = h1 * r1 + h1
12656 //
12657 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
12658 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
12659 //
12660 // sqrt(x) = g3
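// Scaling note: inputs smaller than 0x1.0p-767 are first multiplied by 2^256
// (ldexp +256); since sqrt(x * 2^256) == 2^128 * sqrt(x), the result is
// rescaled with ldexp -128 afterwards.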
12661
12662 SDNodeFlags Flags = Op->getFlags();
12663
12664 SDLoc DL(Op);
12665
12666 SDValue X = Op.getOperand(0);
12667 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
12668
12669 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
12670
12671 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
12672
12673 // Scale up input if it is too small.
12674 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
12675 SDValue ScaleUp =
12676 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
12677 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
12678
12679 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
12680
12681 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
12682
12683 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
12684 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
12685
12686 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
12687 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
12688
12689 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
12690
12691 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
12692
12693 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
12694 SDValue SqrtD0 =
12695 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
12696
12697 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
12698
12699 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
12700 SDValue SqrtD1 =
12701 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
12702
12703 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
12704
12705 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
12706 SDValue ScaleDown =
12707 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
12708 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
12709
12710 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
12711 // with finite only or nsz because rsq(+/-0) = +/-inf
12712
12713 // TODO: Check for DAZ and expand to subnormals
12714 SDValue IsZeroOrInf =
12715 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
12716 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
12717
12718 // If x is +INF, +0, or -0, use its original value
12719 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
12720 Flags);
12721}
12722
12723SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
12724 SDLoc DL(Op);
12725 EVT VT = Op.getValueType();
12726 SDValue Arg = Op.getOperand(0);
12727 SDValue TrigVal;
12728
12729 // Propagate fast-math flags so that the multiply we introduce can be folded
12730 // if Arg is already the result of a multiply by a constant.
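// The multiply by 0.5 * 1/pi below converts the argument from radians into
// units of full revolutions, which is presumably the input the hardware
// sin/cos instructions expect; on subtargets with a reduced trig input range
// the fractional part is additionally taken via FRACT.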
12731 auto Flags = Op->getFlags();
12732
12733 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
12734
12735 if (Subtarget->hasTrigReducedRange()) {
12736 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12737 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
12738 } else {
12739 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
12740 }
12741
12742 switch (Op.getOpcode()) {
12743 case ISD::FCOS:
12744 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
12745 case ISD::FSIN:
12746 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
12747 default:
12748 llvm_unreachable("Wrong trig opcode");
12749 }
12750}
12751
12752SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
12753 SelectionDAG &DAG) const {
12754 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
12755 assert(AtomicNode->isCompareAndSwap());
12756 unsigned AS = AtomicNode->getAddressSpace();
12757
12758 // No custom lowering required for local address space
12760 return Op;
12761
12762 // Non-local address spaces require custom lowering for atomic compare and
12763 // swap; the compare and swap values go in a v2i32 (or v2i64 for _X2).
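// In the packed vector built below, element 0 carries the new (swap) value
// and element 1 the compare value; the memory node then consumes the pair as
// a single operand.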
12764 SDLoc DL(Op);
12765 SDValue ChainIn = Op.getOperand(0);
12766 SDValue Addr = Op.getOperand(1);
12767 SDValue Old = Op.getOperand(2);
12768 SDValue New = Op.getOperand(3);
12769 EVT VT = Op.getValueType();
12770 MVT SimpleVT = VT.getSimpleVT();
12771 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
12772
12773 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
12774 SDValue Ops[] = {ChainIn, Addr, NewOld};
12775
12777 Op->getVTList(), Ops, VT,
12778 AtomicNode->getMemOperand());
12779}
12780
12781//===----------------------------------------------------------------------===//
12782// Custom DAG optimizations
12783//===----------------------------------------------------------------------===//
12784
12785SDValue
12786SITargetLowering::performUCharToFloatCombine(SDNode *N,
12787 DAGCombinerInfo &DCI) const {
12788 EVT VT = N->getValueType(0);
12789 EVT ScalarVT = VT.getScalarType();
12790 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12791 return SDValue();
12792
12793 SelectionDAG &DAG = DCI.DAG;
12794 SDLoc DL(N);
12795
12796 SDValue Src = N->getOperand(0);
12797 EVT SrcVT = Src.getValueType();
12798
12799 // TODO: We could try to match extracting the higher bytes, which would be
12800 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
12801 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
12802 // about in practice.
12803 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12804 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
12805 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
12806 DCI.AddToWorklist(Cvt.getNode());
12807
12808 // For the f16 case, fold to a cast to f32 and then cast back to f16.
12809 if (ScalarVT != MVT::f32) {
12810 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
12811 DAG.getTargetConstant(0, DL, MVT::i32));
12812 }
12813 return Cvt;
12814 }
12815 }
12816
12817 return SDValue();
12818}
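// Illustrative note (not upstream code): this combine fires when the top 24
// bits of the i32 source are known zero, for example
//   (f32 (uint_to_fp (and i32:x, 0xff)))  -->  (CVT_F32_UBYTE0 x)
// and, for an f16 result, the same CVT_F32_UBYTE0 followed by an FP_ROUND back
// to f16, matching the code above.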
12819
12820SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
12821 DAGCombinerInfo &DCI) const {
12822 SDValue MagnitudeOp = N->getOperand(0);
12823 SDValue SignOp = N->getOperand(1);
12824
12825 // The generic combine for fcopysign + fp cast is too conservative with
12826 // vectors, and also gets confused by the splitting we will perform here, so
12827 // peek through FP casts.
12828 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
12829 SignOp.getOpcode() == ISD::FP_ROUND)
12830 SignOp = SignOp.getOperand(0);
12831
12832 SelectionDAG &DAG = DCI.DAG;
12833 SDLoc DL(N);
12834 EVT SignVT = SignOp.getValueType();
12835
12836 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
12837 // lower half with a copy.
12838 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
12839 EVT MagVT = MagnitudeOp.getValueType();
12840
12841 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
12842
12843 if (MagVT.getScalarType() == MVT::f64) {
12844 EVT F32VT = MagVT.isVector()
12845 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12846 : MVT::v2f32;
12847
12848 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
12849
12850 SmallVector<SDValue, 8> NewElts;
12851 for (unsigned I = 0; I != NumElts; ++I) {
12852 SDValue MagLo =
12853 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12854 DAG.getConstant(2 * I, DL, MVT::i32));
12855 SDValue MagHi =
12856 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
12857 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12858
12859 SDValue SignOpElt =
12860 MagVT.isVector()
12861 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
12862 SignOp, DAG.getConstant(I, DL, MVT::i32))
12863 : SignOp;
12864
12865 SDValue HiOp =
12866 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
12867
12868 SDValue Vector =
12869 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
12870
12871 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
12872 NewElts.push_back(NewElt);
12873 }
12874
12875 if (NewElts.size() == 1)
12876 return NewElts[0];
12877
12878 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
12879 }
12880
12881 if (SignVT.getScalarType() != MVT::f64)
12882 return SDValue();
12883
12884 // Reduce width of sign operand, we only need the highest bit.
12885 //
12886 // fcopysign f64:x, f64:y ->
12887 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
12888 // TODO: In some cases it might make sense to go all the way to f16.
12889
12890 EVT F32VT = MagVT.isVector()
12891 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
12892 : MVT::v2f32;
12893
12894 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
12895
12896 SmallVector<SDValue, 8> F32Signs;
12897 for (unsigned I = 0; I != NumElts; ++I) {
12898 // Take sign from odd elements of cast vector
12899 SDValue SignAsF32 =
12900 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
12901 DAG.getConstant(2 * I + 1, DL, MVT::i32));
12902 F32Signs.push_back(SignAsF32);
12903 }
12904
12905 SDValue NewSign =
12906 NumElts == 1
12907 ? F32Signs.back()
12908 : DAG.getNode(ISD::BUILD_VECTOR, DL,
12909 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
12910 F32Signs);
12911
12912 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
12913 NewSign);
12914}
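// Illustrative note (not upstream code): a worked example of the f64 split
// above. copysign(f64 -2.0, +1.0) bitcasts -2.0 (0xC000000000000000) to v2f32,
// keeps the low word 0x00000000 as-is, and applies an f32 FCOPYSIGN to the
// high word 0xC0000000 using the sign of +1.0, giving 0x40000000; rebuilding
// the f64 yields 0x4000000000000000 == +2.0. Only the half that holds the sign
// bit needs a real copysign.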
12915
12916// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
12917// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
12918// bits
12919
12920// This is a variant of
12921// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
12922//
12923 // The normal DAG combiner will do this, but only if the add has one use, since
12924 // duplicating the add for multiple uses would increase the number of instructions.
12925//
12926// This prevents us from seeing a constant offset that can be folded into a
12927// memory instruction's addressing mode. If we know the resulting add offset of
12928// a pointer can be folded into an addressing offset, we can replace the pointer
12929 // operand with the add of the new constant offset. This eliminates one of the uses,
12930// and may allow the remaining use to also be simplified.
12931//
12932SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
12933 EVT MemVT,
12934 DAGCombinerInfo &DCI) const {
12935 SDValue N0 = N->getOperand(0);
12936 SDValue N1 = N->getOperand(1);
12937
12938 // We only do this to handle cases where it's profitable when there are
12939 // multiple uses of the add, so defer to the standard combine.
12940 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
12941 return SDValue();
12942
12943 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
12944 if (!CN1)
12945 return SDValue();
12946
12947 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
12948 if (!CAdd)
12949 return SDValue();
12950
12951 SelectionDAG &DAG = DCI.DAG;
12952
12953 if (N0->getOpcode() == ISD::OR &&
12954 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
12955 return SDValue();
12956
12957 // If the resulting offset is too large, we can't fold it into the
12958 // addressing mode offset.
12959 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
12960 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
12961
12962 AddrMode AM;
12963 AM.HasBaseReg = true;
12964 AM.BaseOffs = Offset.getSExtValue();
12965 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
12966 return SDValue();
12967
12968 SDLoc SL(N);
12969 EVT VT = N->getValueType(0);
12970
12971 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
12972 SDValue COffset = DAG.getConstant(Offset, SL, VT);
12973
12974 SDNodeFlags Flags;
12975 Flags.setNoUnsignedWrap(
12976 N->getFlags().hasNoUnsignedWrap() &&
12977 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
12978
12979 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
12980 // be sure that the new left operand is a proper base pointer.
12981 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
12982}
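// Illustrative note (not upstream code): concrete arithmetic for the combine
// above. With multiple uses of the add, (shl (add x, 16), 2) becomes
// (add (shl x, 2), 64) since (x + 16) << 2 == (x << 2) + (16 << 2). The
// exposed constant 64 can then be folded into the memory instruction's
// addressing-mode offset, provided isLegalAddressingMode() accepts that offset
// for the given address space.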
12983
12984 /// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand is offset
12985/// by the chain and intrinsic ID. Theoretically we would also need to check the
12986/// specific intrinsic, but they all place the pointer operand first.
12987static unsigned getBasePtrIndex(const MemSDNode *N) {
12988 switch (N->getOpcode()) {
12989 case ISD::STORE:
12990 case ISD::INTRINSIC_W_CHAIN:
12991 case ISD::INTRINSIC_VOID:
12992 return 2;
12993 default:
12994 return 1;
12995 }
12996}
12997
12998SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
12999 DAGCombinerInfo &DCI) const {
13000 SelectionDAG &DAG = DCI.DAG;
13001
13002 unsigned PtrIdx = getBasePtrIndex(N);
13003 SDValue Ptr = N->getOperand(PtrIdx);
13004
13005 // TODO: We could also do this for multiplies.
13006 if (Ptr.getOpcode() == ISD::SHL) {
13007 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13008 N->getMemoryVT(), DCI);
13009 if (NewPtr) {
13010 SmallVector<SDValue, 8> NewOps(N->ops());
13011
13012 NewOps[PtrIdx] = NewPtr;
13013 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
13014 }
13015 }
13016
13017 return SDValue();
13018}
13019
13020static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
13021 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13022 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13023 (Opc == ISD::XOR && Val == 0);
13024}
13025
13026// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
13027// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
13028// integer combine opportunities since most 64-bit operations are decomposed
13029// this way. TODO: We won't want this for SALU especially if it is an inline
13030// immediate.
13031SDValue SITargetLowering::splitBinaryBitConstantOp(
13032 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
13033 const ConstantSDNode *CRHS) const {
13034 uint64_t Val = CRHS->getZExtValue();
13035 uint32_t ValLo = Lo_32(Val);
13036 uint32_t ValHi = Hi_32(Val);
13037 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13038
13039 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
13040 bitOpWithConstantIsReducible(Opc, ValHi)) ||
13041 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
13042 // We have 64-bit scalar and/or/xor, but do not have vector forms.
13043 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13044 !CRHS->user_begin()->isDivergent())
13045 return SDValue();
13046
13047 // If we need to materialize a 64-bit immediate, it will be split up later
13048 // anyway. Avoid creating the harder to understand 64-bit immediate
13049 // materialization.
13050 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
13051 }
13052
13053 return SDValue();
13054}
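// Illustrative note (not upstream code): for example,
// (and i64:x, 0x00000000FFFFFFFF) splits into the 32-bit halves
// (and lo_32(x), 0xFFFFFFFF) and (and hi_32(x), 0), both of which
// bitOpWithConstantIsReducible() flags as trivial: the low half is just
// lo_32(x) and the high half folds to 0, so one of the two 32-bit operations
// disappears after the split.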
13055
13056 bool llvm::isBoolSGPR(SDValue V) {
13057 if (V.getValueType() != MVT::i1)
13058 return false;
13059 switch (V.getOpcode()) {
13060 default:
13061 break;
13062 case ISD::SETCC:
13063 case ISD::IS_FPCLASS:
13064 case AMDGPUISD::FP_CLASS:
13065 return true;
13066 case ISD::AND:
13067 case ISD::OR:
13068 case ISD::XOR:
13069 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
13070 case ISD::SADDO:
13071 case ISD::UADDO:
13072 case ISD::SSUBO:
13073 case ISD::USUBO:
13074 case ISD::SMULO:
13075 case ISD::UMULO:
13076 return V.getResNo() == 1;
13077 case ISD::INTRINSIC_WO_CHAIN: {
13078 unsigned IntrinsicID = V.getConstantOperandVal(0);
13079 switch (IntrinsicID) {
13080 case Intrinsic::amdgcn_is_shared:
13081 case Intrinsic::amdgcn_is_private:
13082 return true;
13083 default:
13084 return false;
13085 }
13086
13087 return false;
13088 }
13089 }
13090 return false;
13091}
13092
13093// If a constant has all zeroes or all ones within each byte return it.
13094// Otherwise return 0.
13095 static uint32_t getConstantPermuteMask(uint32_t C) {
13096 // 0xff for any zero byte in the mask
13097 uint32_t ZeroByteMask = 0;
13098 if (!(C & 0x000000ff))
13099 ZeroByteMask |= 0x000000ff;
13100 if (!(C & 0x0000ff00))
13101 ZeroByteMask |= 0x0000ff00;
13102 if (!(C & 0x00ff0000))
13103 ZeroByteMask |= 0x00ff0000;
13104 if (!(C & 0xff000000))
13105 ZeroByteMask |= 0xff000000;
13106 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
13107 if ((NonZeroByteMask & C) != NonZeroByteMask)
13108 return 0; // Partial bytes selected.
13109 return C;
13110}
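// Illustrative note (not upstream code): worked examples for the helper above.
//   getConstantPermuteMask(0x00FF00FF) == 0x00FF00FF  (every byte is 0x00 or 0xFF)
//   getConstantPermuteMask(0x000000F0) == 0           (byte 0 is only partially set)
// A nonzero result means an AND/OR with that constant selects whole bytes,
// which is exactly what a v_perm_b32 mask can express.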
13111
13112// Check if a node selects whole bytes from its operand 0 starting at a byte
13113// boundary while masking the rest. Returns select mask as in the v_perm_b32
13114 // instruction, or ~0 if it does not succeed.
13115// Note byte select encoding:
13116// value 0-3 selects corresponding source byte;
13117// value 0xc selects zero;
13118// value 0xff selects 0xff.
13119 static uint32_t getPermuteMask(SDValue V) {
13120 assert(V.getValueSizeInBits() == 32);
13121
13122 if (V.getNumOperands() != 2)
13123 return ~0;
13124
13125 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
13126 if (!N1)
13127 return ~0;
13128
13129 uint32_t C = N1->getZExtValue();
13130
13131 switch (V.getOpcode()) {
13132 default:
13133 break;
13134 case ISD::AND:
13135 if (uint32_t ConstMask = getConstantPermuteMask(C))
13136 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13137 break;
13138
13139 case ISD::OR:
13140 if (uint32_t ConstMask = getConstantPermuteMask(C))
13141 return (0x03020100 & ~ConstMask) | ConstMask;
13142 break;
13143
13144 case ISD::SHL:
13145 if (C % 8)
13146 return ~0;
13147
13148 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13149
13150 case ISD::SRL:
13151 if (C % 8)
13152 return ~0;
13153
13154 return uint32_t(0x0c0c0c0c03020100ull >> C);
13155 }
13156
13157 return ~0;
13158}
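// Illustrative note (not upstream code): worked examples for getPermuteMask(),
// using the byte-select encoding documented above (0-3 = source byte,
// 0x0c = zero):
//   (and x, 0x0000FFFF) -> 0x0C0C0100  (dest bytes 0-1 from src bytes 0-1, rest zero)
//   (shl x, 16)         -> 0x01000C0C  (dest bytes 2-3 from src bytes 0-1, rest zero)
//   (srl x, 8)          -> 0x0C030201  (dest bytes 0-2 from src bytes 1-3, byte 3 zero)
// Anything that cannot be expressed this way yields ~0.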
13159
13160SDValue SITargetLowering::performAndCombine(SDNode *N,
13161 DAGCombinerInfo &DCI) const {
13162 if (DCI.isBeforeLegalize())
13163 return SDValue();
13164
13165 SelectionDAG &DAG = DCI.DAG;
13166 EVT VT = N->getValueType(0);
13167 SDValue LHS = N->getOperand(0);
13168 SDValue RHS = N->getOperand(1);
13169
13170 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
13171 if (VT == MVT::i64 && CRHS) {
13172 if (SDValue Split =
13173 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13174 return Split;
13175 }
13176
13177 if (CRHS && VT == MVT::i32) {
13178 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
13179 // nb = number of trailing zeroes in mask
13180 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
13181 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
13182 uint64_t Mask = CRHS->getZExtValue();
13183 unsigned Bits = llvm::popcount(Mask);
13184 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
13185 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
13186 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
13187 unsigned Shift = CShift->getZExtValue();
13188 unsigned NB = CRHS->getAPIntValue().countr_zero();
13189 unsigned Offset = NB + Shift;
13190 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
13191 SDLoc SL(N);
13192 SDValue BFE =
13193 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13194 DAG.getConstant(Offset, SL, MVT::i32),
13195 DAG.getConstant(Bits, SL, MVT::i32));
13196 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
13197 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
13198 DAG.getValueType(NarrowVT));
13199 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
13200 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
13201 return Shl;
13202 }
13203 }
13204 }
13205
13206 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13207 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
13208 isa<ConstantSDNode>(LHS.getOperand(2))) {
13209 uint32_t Sel = getConstantPermuteMask(Mask);
13210 if (!Sel)
13211 return SDValue();
13212
13213 // Select 0xc for all zero bytes
13214 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13215 SDLoc DL(N);
13216 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13217 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13218 }
13219 }
13220
13221 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
13222 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
13223 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
13224 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13225 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
13226
13227 SDValue X = LHS.getOperand(0);
13228 SDValue Y = RHS.getOperand(0);
13229 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13230 !isTypeLegal(X.getValueType()))
13231 return SDValue();
13232
13233 if (LCC == ISD::SETO) {
13234 if (X != LHS.getOperand(1))
13235 return SDValue();
13236
13237 if (RCC == ISD::SETUNE) {
13238 const ConstantFPSDNode *C1 =
13239 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
13240 if (!C1 || !C1->isInfinity() || C1->isNegative())
13241 return SDValue();
13242
13243 const uint32_t Mask = SIInstrFlags::N_NORMAL |
13244 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
13245 SIInstrFlags::P_SUBNORMAL | SIInstrFlags::N_ZERO |
13246 SIInstrFlags::P_ZERO;
13247
13248 static_assert(
13249 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
13250 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
13251 0x3ff) == Mask,
13252 "mask not equal");
13253
13254 SDLoc DL(N);
13255 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13256 DAG.getConstant(Mask, DL, MVT::i32));
13257 }
13258 }
13259 }
13260
13261 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13262 std::swap(LHS, RHS);
13263
13264 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13265 RHS.hasOneUse()) {
13266 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
13267 // and (fcmp seto), (fp_class x, mask) ->
13268 //   fp_class x, mask & ~(p_nan | n_nan)
13269 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
13270 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13271 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
13272 (RHS.getOperand(0) == LHS.getOperand(0) &&
13273 LHS.getOperand(0) == LHS.getOperand(1))) {
13274 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
13275 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13276 : Mask->getZExtValue() & OrdMask;
13277
13278 SDLoc DL(N);
13279 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13280 DAG.getConstant(NewMask, DL, MVT::i32));
13281 }
13282 }
13283
13284 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
13285 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
13286 // and x, (sext cc from i1) => select cc, x, 0
13287 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
13288 std::swap(LHS, RHS);
13289 if (isBoolSGPR(RHS.getOperand(0)))
13290 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
13291 DAG.getConstant(0, SDLoc(N), MVT::i32));
13292 }
13293
13294 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13295 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13296 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13297 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13298 uint32_t LHSMask = getPermuteMask(LHS);
13299 uint32_t RHSMask = getPermuteMask(RHS);
13300 if (LHSMask != ~0u && RHSMask != ~0u) {
13301 // Canonicalize the expression in an attempt to have fewer unique masks
13302 // and therefore fewer registers used to hold the masks.
13303 if (LHSMask > RHSMask) {
13304 std::swap(LHSMask, RHSMask);
13305 std::swap(LHS, RHS);
13306 }
13307
13308 // Select 0xc for each lane used from source operand. Zero has 0xc mask
13309 // set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
13310 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13311 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13312
13313 // Check if we need to combine values from two sources within a byte.
13314 if (!(LHSUsedLanes & RHSUsedLanes) &&
13315 // If we select high and lower word keep it for SDWA.
13316 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
13317 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13318 // Each byte in each mask is either selector mask 0-3, or has higher
13319 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
13320 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
13321 // mask which is not 0xff wins. By anding both masks we have a correct
13322 // result except that 0x0c shall be corrected to give 0x0c only.
13323 uint32_t Mask = LHSMask & RHSMask;
13324 for (unsigned I = 0; I < 32; I += 8) {
13325 uint32_t ByteSel = 0xff << I;
13326 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13327 Mask &= (0x0c << I) & 0xffffffff;
13328 }
13329
13330 // Add 4 to each active LHS lane. It will not affect any existing 0xff
13331 // or 0x0c.
13332 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13333 SDLoc DL(N);
13334
13335 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13336 RHS.getOperand(0),
13337 DAG.getConstant(Sel, DL, MVT::i32));
13338 }
13339 }
13340 }
13341
13342 return SDValue();
13343}
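// Illustrative note (not upstream code): a concrete instance of the
// SDWA-friendly BFE rewrite in performAndCombine(). For (and (srl x, 4), 0xFF0)
// the mask has popcount 8, NB = countr_zero(0xFF0) = 4 and Offset = 4 + 4 = 8,
// which is byte aligned, so the combine emits
//   (shl (AssertZext i8 (BFE_U32 x, 8, 8)), 4)
// computing ((x >> 8) & 0xFF) << 4 == (x >> 4) & 0xFF0.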
13344
13345// A key component of v_perm is a mapping between byte position of the src
13346// operands, and the byte position of the dest. To provide such, we need: 1. the
13347// node that provides x byte of the dest of the OR, and 2. the byte of the node
13348// used to provide that x byte. calculateByteProvider finds which node provides
13349// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
13350 // and finds an ultimate src and byte position. For example, the supported
13351// LoadCombine pattern for vector loads is as follows
13352// t1
13353// or
13354// / \
13355// t2 t3
13356// zext shl
13357// | | \
13358// t4 t5 16
13359// or anyext
13360// / \ |
13361// t6 t7 t8
13362// srl shl or
13363// / | / \ / \
13364// t9 t10 t11 t12 t13 t14
13365// trunc* 8 trunc* 8 and and
13366// | | / | | \
13367// t15 t16 t17 t18 t19 t20
13368// trunc* 255 srl -256
13369// | / \
13370// t15 t15 16
13371//
13372// *In this example, the truncs are from i32->i16
13373//
13374// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
13375// respectively. calculateSrcByte would find (given node) -> ultimate src &
13376// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
13377// After finding the mapping, we can combine the tree into vperm t15, t16,
13378// 0x05000407
13379
13380// Find the source and byte position from a node.
13381// \p DestByte is the byte position of the dest of the or that the src
13382// ultimately provides. \p SrcIndex is the byte of the src that maps to this
13383// dest of the or byte. \p Depth tracks how many recursive iterations we have
13384// performed.
13385static const std::optional<ByteProvider<SDValue>>
13386calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
13387 unsigned Depth = 0) {
13388 // We may need to recursively traverse a series of SRLs
13389 if (Depth >= 6)
13390 return std::nullopt;
13391
13392 if (Op.getValueSizeInBits() < 8)
13393 return std::nullopt;
13394
13395 if (Op.getValueType().isVector())
13396 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13397
13398 switch (Op->getOpcode()) {
13399 case ISD::TRUNCATE: {
13400 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13401 }
13402
13403 case ISD::SIGN_EXTEND:
13404 case ISD::ZERO_EXTEND:
13405 case ISD::SIGN_EXTEND_INREG: {
13406 SDValue NarrowOp = Op->getOperand(0);
13407 auto NarrowVT = NarrowOp.getValueType();
13408 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
13409 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13410 NarrowVT = VTSign->getVT();
13411 }
13412 if (!NarrowVT.isByteSized())
13413 return std::nullopt;
13414 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
13415
13416 if (SrcIndex >= NarrowByteWidth)
13417 return std::nullopt;
13418 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13419 }
13420
13421 case ISD::SRA:
13422 case ISD::SRL: {
13423 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13424 if (!ShiftOp)
13425 return std::nullopt;
13426
13427 uint64_t BitShift = ShiftOp->getZExtValue();
13428
13429 if (BitShift % 8 != 0)
13430 return std::nullopt;
13431
13432 SrcIndex += BitShift / 8;
13433
13434 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
13435 }
13436
13437 default: {
13438 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
13439 }
13440 }
13441 llvm_unreachable("fully handled switch");
13442}
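// Illustrative note (not upstream code): calculateSrcByte() walks through
// truncates, extends and byte-aligned right shifts to name the value and byte
// that ultimately feed a given destination byte. For Op = (srl t, 16) and
// SrcIndex = 0 it adds 16 / 8 = 2 to SrcIndex and recurses on t, landing in
// the default case with the answer "source t, byte 2"; a TRUNCATE in between
// is simply looked through, subject to the byte-width checks above.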
13443
13444// For a byte position in the result of an Or, traverse the tree and find the
13445// node (and the byte of the node) which ultimately provides this {Or,
13446// BytePosition}. \p Op is the operand we are currently examining. \p Index is
13447// the byte position of the Op that corresponds with the originally requested
13448// byte of the Or \p Depth tracks how many recursive iterations we have
13449// performed. \p StartingIndex is the originally requested byte of the Or
13450static const std::optional<ByteProvider<SDValue>>
13451calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
13452 unsigned StartingIndex = 0) {
13453 // Finding Src tree of RHS of or typically requires at least 1 additional
13454 // depth
13455 if (Depth > 6)
13456 return std::nullopt;
13457
13458 unsigned BitWidth = Op.getScalarValueSizeInBits();
13459 if (BitWidth % 8 != 0)
13460 return std::nullopt;
13461 if (Index > BitWidth / 8 - 1)
13462 return std::nullopt;
13463
13464 bool IsVec = Op.getValueType().isVector();
13465 switch (Op.getOpcode()) {
13466 case ISD::OR: {
13467 if (IsVec)
13468 return std::nullopt;
13469
13470 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
13471 StartingIndex);
13472 if (!RHS)
13473 return std::nullopt;
13474 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13475 StartingIndex);
13476 if (!LHS)
13477 return std::nullopt;
13478 // A well formed Or will have two ByteProviders for each byte, one of which
13479 // is constant zero
13480 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13481 return std::nullopt;
13482 if (!LHS || LHS->isConstantZero())
13483 return RHS;
13484 if (!RHS || RHS->isConstantZero())
13485 return LHS;
13486 return std::nullopt;
13487 }
13488
13489 case ISD::AND: {
13490 if (IsVec)
13491 return std::nullopt;
13492
13493 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13494 if (!BitMaskOp)
13495 return std::nullopt;
13496
13497 uint32_t BitMask = BitMaskOp->getZExtValue();
13498 // Bits we expect for our StartingIndex
13499 uint32_t IndexMask = 0xFF << (Index * 8);
13500
13501 if ((IndexMask & BitMask) != IndexMask) {
13502 // If the result of the and partially provides the byte, then it
13503 // is not well formatted
13504 if (IndexMask & BitMask)
13505 return std::nullopt;
13506 return ByteProvider<SDValue>::getConstantZero();
13507 }
13508
13509 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
13510 }
13511
13512 case ISD::FSHR: {
13513 if (IsVec)
13514 return std::nullopt;
13515
13516 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
13517 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13518 if (!ShiftOp || Op.getValueType().isVector())
13519 return std::nullopt;
13520
13521 uint64_t BitsProvided = Op.getValueSizeInBits();
13522 if (BitsProvided % 8 != 0)
13523 return std::nullopt;
13524
13525 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13526 if (BitShift % 8)
13527 return std::nullopt;
13528
13529 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13530 uint64_t ByteShift = BitShift / 8;
13531
13532 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13533 uint64_t BytesProvided = BitsProvided / 8;
13534 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13535 NewIndex %= BytesProvided;
13536 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
13537 }
13538
13539 case ISD::SRA:
13540 case ISD::SRL: {
13541 if (IsVec)
13542 return std::nullopt;
13543
13544 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13545 if (!ShiftOp)
13546 return std::nullopt;
13547
13548 uint64_t BitShift = ShiftOp->getZExtValue();
13549 if (BitShift % 8)
13550 return std::nullopt;
13551
13552 auto BitsProvided = Op.getScalarValueSizeInBits();
13553 if (BitsProvided % 8 != 0)
13554 return std::nullopt;
13555
13556 uint64_t BytesProvided = BitsProvided / 8;
13557 uint64_t ByteShift = BitShift / 8;
13558 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
13559 // If the byte we are trying to provide (as tracked by index) falls in this
13560 // range, then the SRL provides the byte. The byte of interest of the src of
13561 // the SRL is Index + ByteShift
13562 return BytesProvided - ByteShift > Index
13563 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
13564 Index + ByteShift)
13565 : std::nullopt;
13566 }
13567
13568 case ISD::SHL: {
13569 if (IsVec)
13570 return std::nullopt;
13571
13572 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13573 if (!ShiftOp)
13574 return std::nullopt;
13575
13576 uint64_t BitShift = ShiftOp->getZExtValue();
13577 if (BitShift % 8 != 0)
13578 return std::nullopt;
13579 uint64_t ByteShift = BitShift / 8;
13580
13581 // If we are shifting by an amount greater than (or equal to)
13582 // the index we are trying to provide, then it provides 0s. If not,
13583 // then these bytes are not definitively 0s, and the corresponding byte
13584 // of interest is Index - ByteShift of the src
13585 return Index < ByteShift
13586 ? ByteProvider<SDValue>::getConstantZero()
13587 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
13588 Depth + 1, StartingIndex);
13589 }
13590 case ISD::ANY_EXTEND:
13591 case ISD::SIGN_EXTEND:
13592 case ISD::ZERO_EXTEND:
13593 case ISD::SIGN_EXTEND_INREG:
13594 case ISD::AssertZext:
13595 case ISD::AssertSext: {
13596 if (IsVec)
13597 return std::nullopt;
13598
13599 SDValue NarrowOp = Op->getOperand(0);
13600 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
13601 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
13602 Op->getOpcode() == ISD::AssertZext ||
13603 Op->getOpcode() == ISD::AssertSext) {
13604 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
13605 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13606 }
13607 if (NarrowBitWidth % 8 != 0)
13608 return std::nullopt;
13609 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13610
13611 if (Index >= NarrowByteWidth)
13612 return Op.getOpcode() == ISD::ZERO_EXTEND
13613 ? std::optional<ByteProvider<SDValue>>(
13614 ByteProvider<SDValue>::getConstantZero())
13615 : std::nullopt;
13616 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
13617 }
13618
13619 case ISD::TRUNCATE: {
13620 if (IsVec)
13621 return std::nullopt;
13622
13623 uint64_t NarrowByteWidth = BitWidth / 8;
13624
13625 if (NarrowByteWidth >= Index) {
13626 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
13627 StartingIndex);
13628 }
13629
13630 return std::nullopt;
13631 }
13632
13633 case ISD::CopyFromReg: {
13634 if (BitWidth / 8 > Index)
13635 return calculateSrcByte(Op, StartingIndex, Index);
13636
13637 return std::nullopt;
13638 }
13639
13640 case ISD::LOAD: {
13641 auto *L = cast<LoadSDNode>(Op.getNode());
13642
13643 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13644 if (NarrowBitWidth % 8 != 0)
13645 return std::nullopt;
13646 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13647
13648 // If the width of the load does not reach the byte we are trying to provide
13649 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
13650 // question
13651 if (Index >= NarrowByteWidth) {
13652 return L->getExtensionType() == ISD::ZEXTLOAD
13653 ? std::optional<ByteProvider<SDValue>>(
13654 ByteProvider<SDValue>::getConstantZero())
13655 : std::nullopt;
13656 }
13657
13658 if (NarrowByteWidth > Index) {
13659 return calculateSrcByte(Op, StartingIndex, Index);
13660 }
13661
13662 return std::nullopt;
13663 }
13664
13665 case ISD::BSWAP: {
13666 if (IsVec)
13667 return std::nullopt;
13668
13669 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
13670 Depth + 1, StartingIndex);
13671 }
13672
13673 case ISD::EXTRACT_VECTOR_ELT: {
13674 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
13675 if (!IdxOp)
13676 return std::nullopt;
13677 auto VecIdx = IdxOp->getZExtValue();
13678 auto ScalarSize = Op.getScalarValueSizeInBits();
13679 if (ScalarSize < 32)
13680 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13681 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
13682 StartingIndex, Index);
13683 }
13684
13685 case AMDGPUISD::PERM: {
13686 if (IsVec)
13687 return std::nullopt;
13688
13689 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
13690 if (!PermMask)
13691 return std::nullopt;
13692
13693 auto IdxMask =
13694 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13695 if (IdxMask > 0x07 && IdxMask != 0x0c)
13696 return std::nullopt;
13697
13698 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13699 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13700
13701 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
13702 : std::optional<ByteProvider<SDValue>>(
13703 ByteProvider<SDValue>::getConstantZero());
13704 }
13705
13706 default: {
13707 return std::nullopt;
13708 }
13709 }
13710
13711 llvm_unreachable("fully handled switch");
13712}
13713
13714// Returns true if the Operand is a scalar and is 16 bits
13715static bool isExtendedFrom16Bits(SDValue &Operand) {
13716
13717 switch (Operand.getOpcode()) {
13718 case ISD::ANY_EXTEND:
13719 case ISD::SIGN_EXTEND:
13720 case ISD::ZERO_EXTEND: {
13721 auto OpVT = Operand.getOperand(0).getValueType();
13722 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
13723 }
13724 case ISD::LOAD: {
13725 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
13726 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
13727 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
13728 ExtType == ISD::EXTLOAD) {
13729 auto MemVT = L->getMemoryVT();
13730 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
13731 }
13732 return L->getMemoryVT().getSizeInBits() == 16;
13733 }
13734 default:
13735 return false;
13736 }
13737}
13738
13739// Returns true if the mask matches consecutive bytes, and the first byte
13740 // begins at an even (16-bit aligned) byte offset from the 0th byte
13741static bool addresses16Bits(int Mask) {
13742 int Low8 = Mask & 0xff;
13743 int Hi8 = (Mask & 0xff00) >> 8;
13744
13745 assert(Low8 < 8 && Hi8 < 8);
13746 // Are the bytes contiguous in the order of increasing addresses.
13747 bool IsConsecutive = (Hi8 - Low8 == 1);
13748 // Is the first byte at location that is aligned for 16 bit instructions.
13749 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
13750 // In this case, we still need code to extract the 16 bit operand, so it
13751 // is better to use i8 v_perm
13752 bool Is16Aligned = !(Low8 % 2);
13753
13754 return IsConsecutive && Is16Aligned;
13755}
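// Illustrative note (not upstream code): worked examples for addresses16Bits().
//   Mask 0x0504: Low8 = 4, Hi8 = 5 -> consecutive and 16-bit aligned -> true
//   Mask 0x0201: Low8 = 1, Hi8 = 2 -> consecutive but starts at an odd byte -> false
// Only byte selectors in the 0-7 range reach this helper, per the assert.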
13756
13757// Do not lower into v_perm if the operands are actually 16 bit
13758// and the selected bits (based on PermMask) correspond with two
13759// easily addressable 16 bit operands.
13760 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
13761 SDValue &OtherOp) {
13762 int Low16 = PermMask & 0xffff;
13763 int Hi16 = (PermMask & 0xffff0000) >> 16;
13764
13765 auto TempOp = peekThroughBitcasts(Op);
13766 auto TempOtherOp = peekThroughBitcasts(OtherOp);
13767
13768 auto OpIs16Bit =
13769 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
13770 if (!OpIs16Bit)
13771 return true;
13772
13773 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13774 isExtendedFrom16Bits(TempOtherOp);
13775 if (!OtherOpIs16Bit)
13776 return true;
13777
13778 // Do we cleanly address both
13779 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
13780}
13781
13782 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
13783 unsigned DWordOffset) {
13784 SDValue Ret;
13785
13786 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
13787 // ByteProvider must be at least 8 bits
13788 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13789
13790 if (TypeSize <= 32)
13791 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
13792
13793 if (Src.getValueType().isVector()) {
13794 auto ScalarTySize = Src.getScalarValueSizeInBits();
13795 auto ScalarTy = Src.getValueType().getScalarType();
13796 if (ScalarTySize == 32) {
13797 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
13798 DAG.getConstant(DWordOffset, SL, MVT::i32));
13799 }
13800 if (ScalarTySize > 32) {
13801 Ret = DAG.getNode(
13802 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
13803 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13804 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13805 if (ShiftVal)
13806 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
13807 DAG.getConstant(ShiftVal, SL, MVT::i32));
13808 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13809 }
13810
13811 assert(ScalarTySize < 32);
13812 auto NumElements = TypeSize / ScalarTySize;
13813 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13814 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13815 auto NumElementsIn32 = 32 / ScalarTySize;
13816 auto NumAvailElements = DWordOffset < Trunc32Elements
13817 ? NumElementsIn32
13818 : NumElements - NormalizedTrunc;
13819
13820 SmallVector<SDValue, 4> VecSrcs;
13821 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
13822 NumAvailElements);
13823
13824 Ret = DAG.getBuildVector(
13825 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
13826 VecSrcs);
13827 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13828 }
13829
13830 /// Scalar Type
13831 auto ShiftVal = 32 * DWordOffset;
13832 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
13833 DAG.getConstant(ShiftVal, SL, MVT::i32));
13834 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
13835}
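// Illustrative note (not upstream code): getDWordFromOffset() returns the
// DWordOffset'th 32-bit chunk of Src as an i32. For a scalar i64 source with
// DWordOffset = 1 it shifts right by 32 and truncates to i32; for a v4i16
// source with DWordOffset = 1 it extracts elements 2-3, rebuilds a v2i16, and
// bitcasts that to i32.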
13836
13837 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
13838 SelectionDAG &DAG = DCI.DAG;
13839 [[maybe_unused]] EVT VT = N->getValueType(0);
13840 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
13841
13842 // VT is known to be MVT::i32, so we need to provide 4 bytes.
13843 assert(VT == MVT::i32);
13844 for (int i = 0; i < 4; i++) {
13845 // Find the ByteProvider that provides the ith byte of the result of OR
13846 std::optional<ByteProvider<SDValue>> P =
13847 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
13848 // TODO support constantZero
13849 if (!P || P->isConstantZero())
13850 return SDValue();
13851
13852 PermNodes.push_back(*P);
13853 }
13854 if (PermNodes.size() != 4)
13855 return SDValue();
13856
13857 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
13858 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
13859 uint64_t PermMask = 0x00000000;
13860 for (size_t i = 0; i < PermNodes.size(); i++) {
13861 auto PermOp = PermNodes[i];
13862 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
13863 // by sizeof(Src2) = 4
13864 int SrcByteAdjust = 4;
13865
13866 // If the Src uses a byte from a different DWORD, then it corresponds
13867 // with a different source
13868 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
13869 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
13870 if (SecondSrc)
13871 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
13872 ((PermOp.SrcOffset / 4) != SecondSrc->second))
13873 return SDValue();
13874
13875 // Set the index of the second distinct Src node
13876 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
13877 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
13878 SrcByteAdjust = 0;
13879 }
13880 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
13882 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
13883 }
13884 SDLoc DL(N);
13885 SDValue Op = *PermNodes[FirstSrc.first].Src;
13886 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
13887 assert(Op.getValueSizeInBits() == 32);
13888
13889 // Check that we are not just extracting the bytes in order from an op
13890 if (!SecondSrc) {
13891 int Low16 = PermMask & 0xffff;
13892 int Hi16 = (PermMask & 0xffff0000) >> 16;
13893
13894 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
13895 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
13896
13897 // The perm op would really just produce Op. So combine into Op
13898 if (WellFormedLow && WellFormedHi)
13899 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
13900 }
13901
13902 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
13903
13904 if (SecondSrc) {
13905 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
13906 assert(OtherOp.getValueSizeInBits() == 32);
13907 }
13908
13909 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
13910
13911 assert(Op.getValueType().isByteSized() &&
13912 OtherOp.getValueType().isByteSized());
13913
13914 // If the ultimate src is less than 32 bits, then we will only be
13915 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
13916 // CalculateByteProvider would not have returned Op as source if we
13917 // used a byte that is outside its ValueType. Thus, we are free to
13918 // ANY_EXTEND as the extended bits are dont-cares.
13919 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
13920 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
13921
13922 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
13923 DAG.getConstant(PermMask, DL, MVT::i32));
13924 }
13925 return SDValue();
13926}
13927
13928SDValue SITargetLowering::performOrCombine(SDNode *N,
13929 DAGCombinerInfo &DCI) const {
13930 SelectionDAG &DAG = DCI.DAG;
13931 SDValue LHS = N->getOperand(0);
13932 SDValue RHS = N->getOperand(1);
13933
13934 EVT VT = N->getValueType(0);
13935 if (VT == MVT::i1) {
13936 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
13937 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13938 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
13939 SDValue Src = LHS.getOperand(0);
13940 if (Src != RHS.getOperand(0))
13941 return SDValue();
13942
13943 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
13944 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
13945 if (!CLHS || !CRHS)
13946 return SDValue();
13947
13948 // Only 10 bits are used.
13949 static const uint32_t MaxMask = 0x3ff;
13950
13951 uint32_t NewMask =
13952 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
13953 SDLoc DL(N);
13954 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
13955 DAG.getConstant(NewMask, DL, MVT::i32));
13956 }
13957
13958 return SDValue();
13959 }
13960
13961 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
13962 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
13963 LHS.getOpcode() == AMDGPUISD::PERM &&
13964 isa<ConstantSDNode>(LHS.getOperand(2))) {
13965 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
13966 if (!Sel)
13967 return SDValue();
13968
13969 Sel |= LHS.getConstantOperandVal(2);
13970 SDLoc DL(N);
13971 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13972 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
13973 }
13974
13975 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
13976 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13977 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
13978 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13979
13980 // If all the uses of an or need to extract the individual elements, do not
13981 // attempt to lower into v_perm
13982 auto usesCombinedOperand = [](SDNode *OrUse) {
13983 // If we have any non-vectorized use, then it is a candidate for v_perm
13984 if (OrUse->getOpcode() != ISD::BITCAST ||
13985 !OrUse->getValueType(0).isVector())
13986 return true;
13987
13988 // If we have any non-vectorized use, then it is a candidate for v_perm
13989 for (auto *VUser : OrUse->users()) {
13990 if (!VUser->getValueType(0).isVector())
13991 return true;
13992
13993 // If the use of a vector is a store, then combining via a v_perm
13994 // is beneficial.
13995 // TODO -- whitelist more uses
13996 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
13997 if (VUser->getOpcode() == VectorwiseOp)
13998 return true;
13999 }
14000 return false;
14001 };
14002
14003 if (!any_of(N->users(), usesCombinedOperand))
14004 return SDValue();
14005
14006 uint32_t LHSMask = getPermuteMask(LHS);
14007 uint32_t RHSMask = getPermuteMask(RHS);
14008
14009 if (LHSMask != ~0u && RHSMask != ~0u) {
14010 // Canonicalize the expression in an attempt to have fewer unique masks
14011 // and therefore fewer registers used to hold the masks.
14012 if (LHSMask > RHSMask) {
14013 std::swap(LHSMask, RHSMask);
14014 std::swap(LHS, RHS);
14015 }
14016
14017 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14018 // set, 0xff bytes have 0xff in the mask, and actual lanes are in the 0-3 range.
14019 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14020 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14021
14022 // Check if we need to combine values from two sources within a byte.
14023 if (!(LHSUsedLanes & RHSUsedLanes) &&
14024 // If we select high and lower word keep it for SDWA.
14025 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14026 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14027 // Kill zero bytes selected by other mask. Zero value is 0xc.
14028 LHSMask &= ~RHSUsedLanes;
14029 RHSMask &= ~LHSUsedLanes;
14030 // Add 4 to each active LHS lane
14031 LHSMask |= LHSUsedLanes & 0x04040404;
14032 // Combine masks
14033 uint32_t Sel = LHSMask | RHSMask;
14034 SDLoc DL(N);
14035
14036 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14037 RHS.getOperand(0),
14038 DAG.getConstant(Sel, DL, MVT::i32));
14039 }
14040 }
14041 if (LHSMask == ~0u || RHSMask == ~0u) {
14042 if (SDValue Perm = matchPERM(N, DCI))
14043 return Perm;
14044 }
14045 }
14046
14047 // Detect identity v2i32 OR and replace with identity source node.
14048 // Specifically an Or that has operands constructed from the same source node
14049 // via extract_vector_elt and build_vector, i.e.
14050 // v2i32 or(
14051 // v2i32 build_vector(
14052 // i32 extract_elt(%IdentitySrc, 0),
14053 // i32 0
14054 // ),
14055 // v2i32 build_vector(
14056 // i32 0,
14057 // i32 extract_elt(%IdentitySrc, 1)
14058 // ) )
14059 // =>
14060 // v2i32 %IdentitySrc
14061
14062 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
14063 RHS->getOpcode() == ISD::BUILD_VECTOR) {
14064
14065 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
14066 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
14067
14068 // Test for and normalise build vectors.
14069 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
14070
14071 // Get the extract_vector_element operands.
14072 SDValue LEVE = LHS->getOperand(0);
14073 SDValue REVE = RHS->getOperand(1);
14074
14075 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14076 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
14077 // Check that different elements from the same vector are
14078 // extracted.
14079 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
14080 LEVE->getOperand(1) != REVE->getOperand(1)) {
14081 SDValue IdentitySrc = LEVE.getOperand(0);
14082 return IdentitySrc;
14083 }
14084 }
14085 }
14086 }
14087
14088 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14089 return SDValue();
14090
14091 // TODO: This could be a generic combine with a predicate for extracting the
14092 // high half of an integer being free.
14093
14094 // (or i64:x, (zero_extend i32:y)) ->
14095 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
14096 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
14097 RHS.getOpcode() != ISD::ZERO_EXTEND)
14098 std::swap(LHS, RHS);
14099
14100 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
14101 SDValue ExtSrc = RHS.getOperand(0);
14102 EVT SrcVT = ExtSrc.getValueType();
14103 if (SrcVT == MVT::i32) {
14104 SDLoc SL(N);
14105 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
14106 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
14107
14108 DCI.AddToWorklist(LowOr.getNode());
14109 DCI.AddToWorklist(HiBits.getNode());
14110
14111 SDValue Vec =
14112 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
14113 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
14114 }
14115 }
14116
14117 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
14118 if (CRHS) {
14119 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
14120 N->getOperand(0), CRHS))
14121 return Split;
14122 }
14123
14124 return SDValue();
14125}
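// Illustrative note (not upstream code): the final i64 fold above rewrites
//   (or i64:x, (zero_extend i32:y))
// as a v2i32 build_vector of (or lo_32(x), y) and hi_32(x), bitcast back to
// i64. Only one 32-bit OR is needed because the zero-extended operand can
// never set bits in the high half.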
14126
14127SDValue SITargetLowering::performXorCombine(SDNode *N,
14128 DAGCombinerInfo &DCI) const {
14129 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14130 return RV;
14131
14132 SDValue LHS = N->getOperand(0);
14133 SDValue RHS = N->getOperand(1);
14134
14135 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
14136 SelectionDAG &DAG = DCI.DAG;
14137
14138 EVT VT = N->getValueType(0);
14139 if (CRHS && VT == MVT::i64) {
14140 if (SDValue Split =
14141 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14142 return Split;
14143 }
14144
14145 // v2i32 (xor (vselect cc, x, y), K) ->
14146 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
14147 // replaced with source modifiers when the select is lowered to CNDMASK.
14148 unsigned Opc = LHS.getOpcode();
14149 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
14150 (Opc == ISD::SELECT && VT == MVT::i64)) &&
14151 CRHS && CRHS->getAPIntValue().isSignMask()) {
14152 SDValue CC = LHS->getOperand(0);
14153 SDValue TRUE = LHS->getOperand(1);
14154 SDValue FALSE = LHS->getOperand(2);
14155 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
14156 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
14157 SDValue XSelect =
14158 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
14159 return XSelect;
14160 }
14161
14162 // Make sure to apply the 64-bit constant splitting fold before trying to fold
14163 // fneg-like xors into 64-bit select.
14164 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
14165 // This looks like an fneg, try to fold as a source modifier.
14166 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
14167 shouldFoldFNegIntoSrc(N, LHS)) {
14168 // xor (select c, a, b), 0x80000000 ->
14169 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
14170 SDLoc DL(N);
14171 SDValue CastLHS =
14172 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
14173 SDValue CastRHS =
14174 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
14175 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
14176 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
14177 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
14178 LHS->getOperand(0), FNegLHS, FNegRHS);
14179 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
14180 }
14181 }
14182
14183 return SDValue();
14184}
14185
14186SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
14187 DAGCombinerInfo &DCI) const {
14188 if (!Subtarget->has16BitInsts() ||
14189 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14190 return SDValue();
14191
14192 EVT VT = N->getValueType(0);
14193 if (VT != MVT::i32)
14194 return SDValue();
14195
14196 SDValue Src = N->getOperand(0);
14197 if (Src.getValueType() != MVT::i16)
14198 return SDValue();
14199
14200 return SDValue();
14201}
14202
14203SDValue
14204SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14205 DAGCombinerInfo &DCI) const {
14206 SDValue Src = N->getOperand(0);
14207 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
14208
14209 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
14210 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
14211 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14212 VTSign->getVT() == MVT::i8) ||
14213 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14214 VTSign->getVT() == MVT::i16))) {
14215 assert(Subtarget->hasScalarSubwordLoads() &&
14216 "s_buffer_load_{u8, i8} are supported "
14217 "in GFX12 (or newer) architectures.");
14218 EVT VT = Src.getValueType();
14219 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14220 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14221 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14222 SDLoc DL(N);
14223 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14224 SDValue Ops[] = {
14225 Src.getOperand(0), // source register
14226 Src.getOperand(1), // offset
14227 Src.getOperand(2) // cachePolicy
14228 };
14229 auto *M = cast<MemSDNode>(Src);
14230 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14231 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14232 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
14233 return LoadVal;
14234 }
14235 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14236 VTSign->getVT() == MVT::i8) ||
14237 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14238 VTSign->getVT() == MVT::i16)) &&
14239 Src.hasOneUse()) {
14240 auto *M = cast<MemSDNode>(Src);
14241 SDValue Ops[] = {Src.getOperand(0), // Chain
14242 Src.getOperand(1), // rsrc
14243 Src.getOperand(2), // vindex
14244 Src.getOperand(3), // voffset
14245 Src.getOperand(4), // soffset
14246 Src.getOperand(5), // offset
14247 Src.getOperand(6), Src.getOperand(7)};
14248 // replace with BUFFER_LOAD_BYTE/SHORT
14249 SDVTList ResList =
14250 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14251 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14252 ? AMDGPUISD::BUFFER_LOAD_BYTE
14253 : AMDGPUISD::BUFFER_LOAD_SHORT;
14254 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14255 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14256 return DCI.DAG.getMergeValues(
14257 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14258 }
14259 return SDValue();
14260}
14261
14262SDValue SITargetLowering::performClassCombine(SDNode *N,
14263 DAGCombinerInfo &DCI) const {
14264 SelectionDAG &DAG = DCI.DAG;
14265 SDValue Mask = N->getOperand(1);
14266
14267 // fp_class x, 0 -> false
14268 if (isNullConstant(Mask))
14269 return DAG.getConstant(0, SDLoc(N), MVT::i1);
14270
14271 if (N->getOperand(0).isUndef())
14272 return DAG.getUNDEF(MVT::i1);
14273
14274 return SDValue();
14275}
14276
14277SDValue SITargetLowering::performRcpCombine(SDNode *N,
14278 DAGCombinerInfo &DCI) const {
14279 EVT VT = N->getValueType(0);
14280 SDValue N0 = N->getOperand(0);
14281
14282 if (N0.isUndef()) {
14283 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
14284 SDLoc(N), VT);
14285 }
14286
14287 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
14288 N0.getOpcode() == ISD::SINT_TO_FP)) {
14289 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14290 N->getFlags());
14291 }
14292
14293 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
14294 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
14295 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
14296 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14297 N->getFlags());
14298 }
14299
14300 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
14301}
14302
14303 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
14304 unsigned MaxDepth) const {
14305 unsigned Opcode = Op.getOpcode();
14306 if (Opcode == ISD::FCANONICALIZE)
14307 return true;
14308
14309 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14310 const auto &F = CFP->getValueAPF();
14311 if (F.isNaN() && F.isSignaling())
14312 return false;
14313 if (!F.isDenormal())
14314 return true;
14315
14316 DenormalMode Mode =
14317 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
14318 return Mode == DenormalMode::getIEEE();
14319 }
14320
14321 // If source is a result of another standard FP operation it is already in
14322 // canonical form.
14323 if (MaxDepth == 0)
14324 return false;
14325
14326 switch (Opcode) {
14327 // These will flush denorms if required.
14328 case ISD::FADD:
14329 case ISD::FSUB:
14330 case ISD::FMUL:
14331 case ISD::FCEIL:
14332 case ISD::FFLOOR:
14333 case ISD::FMA:
14334 case ISD::FMAD:
14335 case ISD::FSQRT:
14336 case ISD::FDIV:
14337 case ISD::FREM:
14338 case ISD::FP_ROUND:
14339 case ISD::FP_EXTEND:
14340 case ISD::FP16_TO_FP:
14341 case ISD::FP_TO_FP16:
14342 case ISD::BF16_TO_FP:
14343 case ISD::FP_TO_BF16:
14344 case ISD::FLDEXP:
14345 case AMDGPUISD::FMUL_LEGACY:
14346 case AMDGPUISD::FMAD_FTZ:
14347 case AMDGPUISD::RCP:
14348 case AMDGPUISD::RSQ:
14349 case AMDGPUISD::RSQ_CLAMP:
14350 case AMDGPUISD::RCP_LEGACY:
14351 case AMDGPUISD::RCP_IFLAG:
14352 case AMDGPUISD::LOG:
14353 case AMDGPUISD::EXP:
14354 case AMDGPUISD::DIV_SCALE:
14355 case AMDGPUISD::DIV_FMAS:
14356 case AMDGPUISD::DIV_FIXUP:
14357 case AMDGPUISD::FRACT:
14358 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14359 case AMDGPUISD::CVT_F32_UBYTE0:
14360 case AMDGPUISD::CVT_F32_UBYTE1:
14361 case AMDGPUISD::CVT_F32_UBYTE2:
14362 case AMDGPUISD::CVT_F32_UBYTE3:
14363 case AMDGPUISD::FP_TO_FP16:
14364 case AMDGPUISD::SIN_HW:
14365 case AMDGPUISD::COS_HW:
14366 return true;
14367
14368 // It can/will be lowered or combined as a bit operation.
14369 // Need to check their input recursively to handle.
14370 case ISD::FNEG:
14371 case ISD::FABS:
14372 case ISD::FCOPYSIGN:
14373 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14374
14375 case ISD::AND:
14376 if (Op.getValueType() == MVT::i32) {
14377 // Be careful as we only know it is a bitcast floating point type. It
14378 // could be f32, v2f16, we have no way of knowing. Luckily the constant
14379 // value that we optimize for, which comes up in fp32 to bf16 conversions,
14380 // is valid to optimize for all types.
14381 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
14382 if (RHS->getZExtValue() == 0xffff0000) {
14383 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14384 }
14385 }
14386 }
14387 break;
14388
14389 case ISD::FSIN:
14390 case ISD::FCOS:
14391 case ISD::FSINCOS:
14392 return Op.getValueType().getScalarType() != MVT::f16;
14393
14394 case ISD::FMINNUM:
14395 case ISD::FMAXNUM:
14396 case ISD::FMINNUM_IEEE:
14397 case ISD::FMAXNUM_IEEE:
14398 case ISD::FMINIMUM:
14399 case ISD::FMAXIMUM:
14400 case ISD::FMINIMUMNUM:
14401 case ISD::FMAXIMUMNUM:
14402 case AMDGPUISD::CLAMP:
14403 case AMDGPUISD::FMED3:
14404 case AMDGPUISD::FMAX3:
14405 case AMDGPUISD::FMIN3:
14406 case AMDGPUISD::FMAXIMUM3:
14407 case AMDGPUISD::FMINIMUM3: {
14408 // FIXME: Shouldn't treat the generic operations differently based on these.
14409 // However, we aren't really required to flush the result from
14410 // minnum/maxnum..
14411
14412 // snans will be quieted, so we only need to worry about denormals.
14413 if (Subtarget->supportsMinMaxDenormModes() ||
14414 // FIXME: denormalsEnabledForType is broken for dynamic
14415 denormalsEnabledForType(DAG, Op.getValueType()))
14416 return true;
14417
14418 // Flushing may be required.
14419 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
14420 // targets need to check their input recursively.
14421
14422 // FIXME: Does this apply with clamp? It's implemented with max.
14423 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14424 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
14425 return false;
14426 }
14427
14428 return true;
14429 }
14430 case ISD::SELECT: {
14431 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
14432 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
14433 }
14434 case ISD::BUILD_VECTOR: {
14435 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14436 SDValue SrcOp = Op.getOperand(i);
14437 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
14438 return false;
14439 }
14440
14441 return true;
14442 }
14445 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14446 }
14448 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
14449 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
14450 }
14451 case ISD::UNDEF:
14452 // Could be anything.
14453 return false;
14454
14455 case ISD::BITCAST:
14456 // TODO: This is incorrect as it loses track of the operand's type. We may
14457 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
14458 // same bits that are canonicalized in one type need not be in the other.
14459 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
14460 case ISD::TRUNCATE: {
14461 // Hack around the mess we make when legalizing extract_vector_elt.
14462 if (Op.getValueType() == MVT::i16) {
14463 SDValue TruncSrc = Op.getOperand(0);
14464 if (TruncSrc.getValueType() == MVT::i32 &&
14465 TruncSrc.getOpcode() == ISD::BITCAST &&
14466 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
14467 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
14468 }
14469 }
14470 return false;
14471 }
14472 case ISD::INTRINSIC_WO_CHAIN: {
14473 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14474 // TODO: Handle more intrinsics
14475 switch (IntrinsicID) {
14476 case Intrinsic::amdgcn_cvt_pkrtz:
14477 case Intrinsic::amdgcn_cubeid:
14478 case Intrinsic::amdgcn_frexp_mant:
14479 case Intrinsic::amdgcn_fdot2:
14480 case Intrinsic::amdgcn_rcp:
14481 case Intrinsic::amdgcn_rsq:
14482 case Intrinsic::amdgcn_rsq_clamp:
14483 case Intrinsic::amdgcn_rcp_legacy:
14484 case Intrinsic::amdgcn_rsq_legacy:
14485 case Intrinsic::amdgcn_trig_preop:
14486 case Intrinsic::amdgcn_tanh:
14487 case Intrinsic::amdgcn_log:
14488 case Intrinsic::amdgcn_exp2:
14489 case Intrinsic::amdgcn_sqrt:
14490 return true;
14491 default:
14492 break;
14493 }
14494
14495 break;
14496 }
14497 default:
14498 break;
14499 }
14500
14501 // FIXME: denormalsEnabledForType is broken for dynamic
14502 return denormalsEnabledForType(DAG, Op.getValueType()) &&
14503 DAG.isKnownNeverSNaN(Op);
14504}
14505
14506 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
14507 unsigned MaxDepth) const {
14508 const MachineRegisterInfo &MRI = MF.getRegInfo();
14509 MachineInstr *MI = MRI.getVRegDef(Reg);
14510 unsigned Opcode = MI->getOpcode();
14511
14512 if (Opcode == AMDGPU::G_FCANONICALIZE)
14513 return true;
14514
14515 std::optional<FPValueAndVReg> FCR;
14516 // Constant splat (can be padded with undef) or scalar constant.
14517 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
14518 if (FCR->Value.isSignaling())
14519 return false;
14520 if (!FCR->Value.isDenormal())
14521 return true;
14522
14523 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
14524 return Mode == DenormalMode::getIEEE();
14525 }
14526
14527 if (MaxDepth == 0)
14528 return false;
14529
14530 switch (Opcode) {
14531 case AMDGPU::G_FADD:
14532 case AMDGPU::G_FSUB:
14533 case AMDGPU::G_FMUL:
14534 case AMDGPU::G_FCEIL:
14535 case AMDGPU::G_FFLOOR:
14536 case AMDGPU::G_FRINT:
14537 case AMDGPU::G_FNEARBYINT:
14538 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14539 case AMDGPU::G_INTRINSIC_TRUNC:
14540 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14541 case AMDGPU::G_FMA:
14542 case AMDGPU::G_FMAD:
14543 case AMDGPU::G_FSQRT:
14544 case AMDGPU::G_FDIV:
14545 case AMDGPU::G_FREM:
14546 case AMDGPU::G_FPOW:
14547 case AMDGPU::G_FPEXT:
14548 case AMDGPU::G_FLOG:
14549 case AMDGPU::G_FLOG2:
14550 case AMDGPU::G_FLOG10:
14551 case AMDGPU::G_FPTRUNC:
14552 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14553 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14554 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14555 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14556 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14557 return true;
14558 case AMDGPU::G_FNEG:
14559 case AMDGPU::G_FABS:
14560 case AMDGPU::G_FCOPYSIGN:
14561 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
14562 case AMDGPU::G_FMINNUM:
14563 case AMDGPU::G_FMAXNUM:
14564 case AMDGPU::G_FMINNUM_IEEE:
14565 case AMDGPU::G_FMAXNUM_IEEE:
14566 case AMDGPU::G_FMINIMUM:
14567 case AMDGPU::G_FMAXIMUM:
14568 case AMDGPU::G_FMINIMUMNUM:
14569 case AMDGPU::G_FMAXIMUMNUM: {
14570 if (Subtarget->supportsMinMaxDenormModes() ||
14571 // FIXME: denormalsEnabledForType is broken for dynamic
14572 denormalsEnabledForType(MRI.getType(Reg), MF))
14573 return true;
14574
14575 [[fallthrough]];
14576 }
14577 case AMDGPU::G_BUILD_VECTOR:
14578 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
14579 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
14580 return false;
14581 return true;
14582 case AMDGPU::G_INTRINSIC:
14583 case AMDGPU::G_INTRINSIC_CONVERGENT:
14584 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
14585 case Intrinsic::amdgcn_fmul_legacy:
14586 case Intrinsic::amdgcn_fmad_ftz:
14587 case Intrinsic::amdgcn_sqrt:
14588 case Intrinsic::amdgcn_fmed3:
14589 case Intrinsic::amdgcn_sin:
14590 case Intrinsic::amdgcn_cos:
14591 case Intrinsic::amdgcn_log:
14592 case Intrinsic::amdgcn_exp2:
14593 case Intrinsic::amdgcn_log_clamp:
14594 case Intrinsic::amdgcn_rcp:
14595 case Intrinsic::amdgcn_rcp_legacy:
14596 case Intrinsic::amdgcn_rsq:
14597 case Intrinsic::amdgcn_rsq_clamp:
14598 case Intrinsic::amdgcn_rsq_legacy:
14599 case Intrinsic::amdgcn_div_scale:
14600 case Intrinsic::amdgcn_div_fmas:
14601 case Intrinsic::amdgcn_div_fixup:
14602 case Intrinsic::amdgcn_fract:
14603 case Intrinsic::amdgcn_cvt_pkrtz:
14604 case Intrinsic::amdgcn_cubeid:
14605 case Intrinsic::amdgcn_cubema:
14606 case Intrinsic::amdgcn_cubesc:
14607 case Intrinsic::amdgcn_cubetc:
14608 case Intrinsic::amdgcn_frexp_mant:
14609 case Intrinsic::amdgcn_fdot2:
14610 case Intrinsic::amdgcn_trig_preop:
14611 case Intrinsic::amdgcn_tanh:
14612 return true;
14613 default:
14614 break;
14615 }
14616
14617 [[fallthrough]];
14618 default:
14619 return false;
14620 }
14621
14622 llvm_unreachable("invalid operation");
14623}
14624
14625// Constant fold canonicalize.
14626SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
14627 const SDLoc &SL, EVT VT,
14628 const APFloat &C) const {
14629 // Flush denormals to 0 if not enabled.
14630 if (C.isDenormal()) {
14631 DenormalMode Mode =
14632 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
14633 if (Mode == DenormalMode::getPreserveSign()) {
14634 return DAG.getConstantFP(
14635 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
14636 }
14637
14638 if (Mode != DenormalMode::getIEEE())
14639 return SDValue();
14640 }
14641
14642 if (C.isNaN()) {
14643 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
14644 if (C.isSignaling()) {
14645 // Quiet a signaling NaN.
14646 // FIXME: Is this supposed to preserve payload bits?
14647 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14648 }
14649
14650 // Make sure it is the canonical NaN bitpattern.
14651 //
14652 // TODO: Can we use -1 as the canonical NaN value since it's an inline
14653 // immediate?
14654 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
14655 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
14656 }
14657
14658 // Already canonical.
14659 return DAG.getConstantFP(C, SL, VT);
14660}
14661
14662 static bool vectorEltWillFoldAway(SDValue Op) {
14663 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
14664}
14665
14666SDValue
14667SITargetLowering::performFCanonicalizeCombine(SDNode *N,
14668 DAGCombinerInfo &DCI) const {
14669 SelectionDAG &DAG = DCI.DAG;
14670 SDValue N0 = N->getOperand(0);
14671 EVT VT = N->getValueType(0);
14672
14673 // fcanonicalize undef -> qnan
14674 if (N0.isUndef()) {
14676 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
14677 }
14678
14679 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
14680 EVT VT = N->getValueType(0);
14681 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
14682 }
14683
14684 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
14685 // (fcanonicalize k)
14686 //
14687 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
14688
14689 // TODO: This could be better with wider vectors that will be split to v2f16,
14690 // and to consider uses since there aren't that many packed operations.
14691 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
14692 isTypeLegal(MVT::v2f16)) {
14693 SDLoc SL(N);
14694 SDValue NewElts[2];
14695 SDValue Lo = N0.getOperand(0);
14696 SDValue Hi = N0.getOperand(1);
14697 EVT EltVT = Lo.getValueType();
14698
14699 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
14700 for (unsigned I = 0; I != 2; ++I) {
14701 SDValue Op = N0.getOperand(I);
14702 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
14703 NewElts[I] =
14704 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14705 } else if (Op.isUndef()) {
14706 // Handled below based on what the other operand is.
14707 NewElts[I] = Op;
14708 } else {
14709 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
14710 }
14711 }
14712
14713 // If one half is undef, and one is constant, prefer a splat vector rather
14714 // than the normal qNaN. If it's a register, prefer 0.0 since that's
14715 // cheaper to use and may be free with a packed operation.
14716 if (NewElts[0].isUndef()) {
14717 if (isa<ConstantFPSDNode>(NewElts[1]))
14718 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
14719 ? NewElts[1]
14720 : DAG.getConstantFP(0.0f, SL, EltVT);
14721 }
14722
14723 if (NewElts[1].isUndef()) {
14724 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
14725 ? NewElts[0]
14726 : DAG.getConstantFP(0.0f, SL, EltVT);
14727 }
14728
14729 return DAG.getBuildVector(VT, SL, NewElts);
14730 }
14731 }
14732
14733 return SDValue();
14734}
14735
14736static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
14737 switch (Opc) {
14738 case ISD::FMAXNUM:
14739 case ISD::FMAXNUM_IEEE:
14740 case ISD::FMAXIMUMNUM:
14741 return AMDGPUISD::FMAX3;
14742 case ISD::FMAXIMUM:
14743 return AMDGPUISD::FMAXIMUM3;
14744 case ISD::SMAX:
14745 return AMDGPUISD::SMAX3;
14746 case ISD::UMAX:
14747 return AMDGPUISD::UMAX3;
14748 case ISD::FMINNUM:
14749 case ISD::FMINNUM_IEEE:
14750 case ISD::FMINIMUMNUM:
14751 return AMDGPUISD::FMIN3;
14752 case ISD::FMINIMUM:
14753 return AMDGPUISD::FMINIMUM3;
14754 case ISD::SMIN:
14755 return AMDGPUISD::SMIN3;
14756 case ISD::UMIN:
14757 return AMDGPUISD::UMIN3;
14758 default:
14759 llvm_unreachable("Not a min/max opcode");
14760 }
14761}
14762
14763SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
14764 const SDLoc &SL, SDValue Src,
14765 SDValue MinVal,
14766 SDValue MaxVal,
14767 bool Signed) const {
14768
14769 // med3 comes from
14770 // min(max(x, K0), K1), K0 < K1
14771 // max(min(x, K0), K1), K1 < K0
14772 //
14773 // "MinVal" and "MaxVal" respectively refer to the rhs of the
14774 // min/max op.
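// For example (illustrative): clamping x to [5, 10] arrives here as
// min(max(x, 5), 10) with K0 = 5 < K1 = 10, and is rewritten below to
// med3(x, 5, 10).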
14775 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
14776 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
14777
14778 if (!MinK || !MaxK)
14779 return SDValue();
14780
14781 if (Signed) {
14782 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
14783 return SDValue();
14784 } else {
14785 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
14786 return SDValue();
14787 }
14788
14789 EVT VT = MinK->getValueType(0);
14790 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
14791 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
14792 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
14793
14794 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
14795 // not available, but this is unlikely to be profitable as constants
14796 // will often need to be materialized & extended, especially on
14797 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
14798 return SDValue();
14799}
14800
14801 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
14802 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
14803 return C;
14804
14805 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
14806 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
14807 return C;
14808 }
14809
14810 return nullptr;
14811}
14812
14813SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
14814 const SDLoc &SL, SDValue Op0,
14815 SDValue Op1) const {
14816 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
14817 if (!K1)
14818 return SDValue();
14819
14820 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
14821 if (!K0)
14822 return SDValue();
14823
14824 // Ordered >= (although NaN inputs should have folded away by now).
14825 if (K0->getValueAPF() > K1->getValueAPF())
14826 return SDValue();
14827
14828 // med3 with a nan input acts like
14829 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
14830 //
14831 // So the result depends on whether the IEEE mode bit is enabled or not with a
14832 // signaling nan input.
14833 // ieee=1
14834 // s0 snan: yields s2
14835 // s1 snan: yields s2
14836 // s2 snan: qnan
14837
14838 // s0 qnan: min(s1, s2)
14839 // s1 qnan: min(s0, s2)
14840 // s2 qnan: min(s0, s1)
14841
14842 // ieee=0
14843 // s0 snan: min(s1, s2)
14844 // s1 snan: min(s0, s2)
14845 // s2 snan: qnan
14846
14847 // s0 qnan: min(s1, s2)
14848 // s1 qnan: min(s0, s2)
14849 // s2 qnan: min(s0, s1)
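// In short: only a signaling-NaN input can make med3 differ from the min/max
// expansion (and only with IEEE=1), which is why the fold below requires the
// variable operand to be known not to be an sNaN.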
14850 const MachineFunction &MF = DAG.getMachineFunction();
14851 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14852
14853 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
14854 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
14855 // can only form it if op0 is fmaxnum_ieee when IEEE=1.
14856 EVT VT = Op0.getValueType();
14857 if (Info->getMode().DX10Clamp) {
14858 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
14859 // hardware fmed3 behavior converting to a min.
14860 // FIXME: Should this be allowing -0.0?
14861 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
14862 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
14863 }
14864
14865 // med3 for f16 is only available on gfx9+, and not available for v2f16.
14866 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
14867 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
14868 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
14869 // then give the other result, which is different from med3 with a NaN
14870 // input.
14871 SDValue Var = Op0.getOperand(0);
14872 if (!DAG.isKnownNeverSNaN(Var))
14873 return SDValue();
14874
14875 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14876
14877 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
14878 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
14879 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
14880 SDValue(K0, 0), SDValue(K1, 0));
14881 }
14882 }
14883
14884 return SDValue();
14885}
14886
14887/// \return true if the subtarget supports minimum3 and maximum3 with the given
14888/// base min/max opcode \p Opc for type \p VT.
14889static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
14890 EVT VT) {
14891 switch (Opc) {
14892 case ISD::FMINNUM:
14893 case ISD::FMAXNUM:
14894 case ISD::FMINNUM_IEEE:
14895 case ISD::FMAXNUM_IEEE:
14896 case ISD::FMINIMUMNUM:
14897 case ISD::FMAXIMUMNUM:
14900 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
14901 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
14902 case ISD::FMINIMUM:
14903 case ISD::FMAXIMUM:
14904 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
14905 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
14906 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
14907 case ISD::SMAX:
14908 case ISD::SMIN:
14909 case ISD::UMAX:
14910 case ISD::UMIN:
14911 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
14912 default:
14913 return false;
14914 }
14915
14916 llvm_unreachable("not a min/max opcode");
14917}
14918
14919SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
14920 DAGCombinerInfo &DCI) const {
14921 SelectionDAG &DAG = DCI.DAG;
14922
14923 EVT VT = N->getValueType(0);
14924 unsigned Opc = N->getOpcode();
14925 SDValue Op0 = N->getOperand(0);
14926 SDValue Op1 = N->getOperand(1);
14927
14928 // Only do this if the inner op has one use, since otherwise it just
14929 // increases register pressure for no benefit.
14930
14931 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
14932 // max(max(a, b), c) -> max3(a, b, c)
14933 // min(min(a, b), c) -> min3(a, b, c)
14934 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
14935 SDLoc DL(N);
14936 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14937 Op0.getOperand(0), Op0.getOperand(1), Op1);
14938 }
14939
14940 // Try commuted.
14941 // max(a, max(b, c)) -> max3(a, b, c)
14942 // min(a, min(b, c)) -> min3(a, b, c)
14943 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
14944 SDLoc DL(N);
14945 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
14946 Op0, Op1.getOperand(0), Op1.getOperand(1));
14947 }
14948 }
14949
14950 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
14951 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
14952 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
14953 if (SDValue Med3 = performIntMed3ImmCombine(
14954 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
14955 return Med3;
14956 }
14957 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
14958 if (SDValue Med3 = performIntMed3ImmCombine(
14959 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
14960 return Med3;
14961 }
14962
14963 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
14964 if (SDValue Med3 = performIntMed3ImmCombine(
14965 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
14966 return Med3;
14967 }
14968 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
14969 if (SDValue Med3 = performIntMed3ImmCombine(
14970 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
14971 return Med3;
14972 }
14973
14974 // if !is_snan(x):
14975 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14976 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14977 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14978 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
14979 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
14980 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
14981 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
14982 (Opc == AMDGPUISD::FMIN_LEGACY &&
14983 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
14984 (VT == MVT::f32 || VT == MVT::f64 ||
14985 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
14986 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
14987 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
14988 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
14989 Op0.hasOneUse()) {
14990 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
14991 return Res;
14992 }
14993
14994 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
14995 // for some types, but at a higher cost since it's implemented with a 3
14996 // operand form.
14997 const SDNodeFlags Flags = N->getFlags();
14998 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
14999 !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
15000 unsigned NewOpc =
15001 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
15002 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
15003 }
15004
15005 return SDValue();
15006}
15007
15008 static bool isClampZeroToOne(SDValue A, SDValue B) {
15009 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
15010 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
15011 // FIXME: Should this be allowing -0.0?
15012 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15013 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15014 }
15015 }
15016
15017 return false;
15018}
15019
15020// FIXME: Should only worry about snans for version with chain.
15021SDValue SITargetLowering::performFMed3Combine(SDNode *N,
15022 DAGCombinerInfo &DCI) const {
15023 EVT VT = N->getValueType(0);
15024 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
15025 // NaNs. With a NaN input, the order of the operands may change the result.
15026
15027 SelectionDAG &DAG = DCI.DAG;
15028 SDLoc SL(N);
15029
15030 SDValue Src0 = N->getOperand(0);
15031 SDValue Src1 = N->getOperand(1);
15032 SDValue Src2 = N->getOperand(2);
15033
15034 if (isClampZeroToOne(Src0, Src1)) {
15035 // const_a, const_b, x -> clamp is safe in all cases including signaling
15036 // nans.
15037 // FIXME: Should this be allowing -0.0?
15038 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15039 }
15040
15041 const MachineFunction &MF = DAG.getMachineFunction();
15042 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15043
15044 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
15045 // handling no dx10-clamp?
15046 if (Info->getMode().DX10Clamp) {
15047 // If NaN is clamped to 0, we are free to reorder the inputs.
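// The swaps below just bubble constant operands towards Src1/Src2 so that a
// (0.0, 1.0) pair can be recognized by isClampZeroToOne.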
15048
15049 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15050 std::swap(Src0, Src1);
15051
15052 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
15053 std::swap(Src1, Src2);
15054
15055 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
15056 std::swap(Src0, Src1);
15057
15058 if (isClampZeroToOne(Src1, Src2))
15059 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15060 }
15061
15062 return SDValue();
15063}
15064
15065SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
15066 DAGCombinerInfo &DCI) const {
15067 SDValue Src0 = N->getOperand(0);
15068 SDValue Src1 = N->getOperand(1);
15069 if (Src0.isUndef() && Src1.isUndef())
15070 return DCI.DAG.getUNDEF(N->getValueType(0));
15071 return SDValue();
15072}
15073
15074// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
15075// expanded into a set of cmp/select instructions.
15076 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
15077 unsigned NumElem,
15078 bool IsDivergentIdx,
15079 const GCNSubtarget *Subtarget) {
15080 if (UseDivergentRegisterIndexing)
15081 return false;
15082
15083 unsigned VecSize = EltSize * NumElem;
15084
15085 // Sub-dword vectors of two dwords or less have a better implementation.
15086 if (VecSize <= 64 && EltSize < 32)
15087 return false;
15088
15089 // Always expand the remaining sub-dword cases, otherwise they will be
15090 // lowered via memory.
15091 if (EltSize < 32)
15092 return true;
15093
15094 // Always do this if var-idx is divergent, otherwise it will become a loop.
15095 if (IsDivergentIdx)
15096 return true;
15097
15098 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
15099 unsigned NumInsts = NumElem /* Number of compares */ +
15100 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
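// E.g. a uniform-index extract from v8i32 costs 8 compares + 8 cndmasks = 16
// instructions: still expanded in VGPR-index mode (<= 16 below) but left to
// movrel when that is available (16 > 15).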
15101
15102 // On some architectures (GFX9) movrel is not available and it's better
15103 // to expand.
15104 if (Subtarget->useVGPRIndexMode())
15105 return NumInsts <= 16;
15106
15107 // If movrel is available, use it instead of expanding for vector of 8
15108 // elements.
15109 if (Subtarget->hasMovrel())
15110 return NumInsts <= 15;
15111
15112 return true;
15113}
15114
15115 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
15116 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15117 if (isa<ConstantSDNode>(Idx))
15118 return false;
15119
15120 SDValue Vec = N->getOperand(0);
15121 EVT VecVT = Vec.getValueType();
15122 EVT EltVT = VecVT.getVectorElementType();
15123 unsigned EltSize = EltVT.getSizeInBits();
15124 unsigned NumElem = VecVT.getVectorNumElements();
15125
15126 return SITargetLowering::shouldExpandVectorDynExt(
15127 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
15128}
15129
15130SDValue
15131SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15132 DAGCombinerInfo &DCI) const {
15133 SDValue Vec = N->getOperand(0);
15134 SelectionDAG &DAG = DCI.DAG;
15135
15136 EVT VecVT = Vec.getValueType();
15137 EVT VecEltVT = VecVT.getVectorElementType();
15138 EVT ResVT = N->getValueType(0);
15139
15140 unsigned VecSize = VecVT.getSizeInBits();
15141 unsigned VecEltSize = VecEltVT.getSizeInBits();
15142
15143 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
15144 allUsesHaveSourceMods(N)) {
15145 SDLoc SL(N);
15146 SDValue Idx = N->getOperand(1);
15147 SDValue Elt =
15148 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
15149 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
15150 }
15151
15152 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
15153 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
15154 // There are optimisations to transform 64-bit shifts into 32-bit shifts
15155 // depending on the shift operand. See e.g. performSraCombine().
15156 // This combine ensures that the optimisation is compatible with v2i32
15157 // legalised AND.
15158 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
15159 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
15160
15162 if (!C || C->getZExtValue() != 0x1f)
15163 return SDValue();
15164
15165 SDLoc SL(N);
15166 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
15167 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
15168 Vec->getOperand(0), N->getOperand(1));
15169 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
15170 DAG.ReplaceAllUsesWith(N, A.getNode());
15171 }
15172
15173 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
15174 // =>
15175 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
15176 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
15177 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
15178 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15179 SDLoc SL(N);
15180 SDValue Idx = N->getOperand(1);
15181 unsigned Opc = Vec.getOpcode();
15182
15183 switch (Opc) {
15184 default:
15185 break;
15186 // TODO: Support other binary operations.
15187 case ISD::FADD:
15188 case ISD::FSUB:
15189 case ISD::FMUL:
15190 case ISD::ADD:
15191 case ISD::UMIN:
15192 case ISD::UMAX:
15193 case ISD::SMIN:
15194 case ISD::SMAX:
15195 case ISD::FMAXNUM:
15196 case ISD::FMINNUM:
15197 case ISD::FMAXNUM_IEEE:
15198 case ISD::FMINNUM_IEEE:
15199 case ISD::FMAXIMUM:
15200 case ISD::FMINIMUM: {
15201 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15202 Vec.getOperand(0), Idx);
15203 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
15204 Vec.getOperand(1), Idx);
15205
15206 DCI.AddToWorklist(Elt0.getNode());
15207 DCI.AddToWorklist(Elt1.getNode());
15208 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
15209 }
15210 }
15211 }
15212
15213 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
15214 if (shouldExpandVectorDynExt(N)) {
15215 SDLoc SL(N);
15216 SDValue Idx = N->getOperand(1);
15217 SDValue V;
15218 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15219 SDValue IC = DAG.getVectorIdxConstant(I, SL);
15220 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
15221 if (I == 0)
15222 V = Elt;
15223 else
15224 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
15225 }
15226 return V;
15227 }
15228
15229 if (!DCI.isBeforeLegalize())
15230 return SDValue();
15231
15232 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
15233 // elements. This exposes more load reduction opportunities by replacing
15234 // multiple small extract_vector_elements with a single 32-bit extract.
15235 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
15236 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
15237 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15238 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
15239
15240 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15241 unsigned EltIdx = BitIndex / 32;
15242 unsigned LeftoverBitIdx = BitIndex % 32;
15243 SDLoc SL(N);
15244
15245 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
15246 DCI.AddToWorklist(Cast.getNode());
15247
15248 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
15249 DAG.getConstant(EltIdx, SL, MVT::i32));
15250 DCI.AddToWorklist(Elt.getNode());
15251 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
15252 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
15253 DCI.AddToWorklist(Srl.getNode());
15254
15255 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
15256 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
15257 DCI.AddToWorklist(Trunc.getNode());
15258
15259 if (VecEltVT == ResVT) {
15260 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
15261 }
15262
15263 assert(ResVT.isScalarInteger());
15264 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
15265 }
15266
15267 return SDValue();
15268}
15269
15270SDValue
15271SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15272 DAGCombinerInfo &DCI) const {
15273 SDValue Vec = N->getOperand(0);
15274 SDValue Idx = N->getOperand(2);
15275 EVT VecVT = Vec.getValueType();
15276 EVT EltVT = VecVT.getVectorElementType();
15277
15278 // INSERT_VECTOR_ELT (<n x e>, var-idx)
15279 // => BUILD_VECTOR n x select (e, const-idx)
15280 if (!shouldExpandVectorDynExt(N))
15281 return SDValue();
15282
15283 SelectionDAG &DAG = DCI.DAG;
15284 SDLoc SL(N);
15285 SDValue Ins = N->getOperand(1);
15286 EVT IdxVT = Idx.getValueType();
15287
15288 SmallVector<SDValue, 16> Ops;
15289 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
15290 SDValue IC = DAG.getConstant(I, SL, IdxVT);
15291 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
15292 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
15293 Ops.push_back(V);
15294 }
15295
15296 return DAG.getBuildVector(VecVT, SL, Ops);
15297}
15298
15299/// Return the source of an fp_extend from f16 to f32, or a converted FP
15300/// constant.
15301 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
15302 if (Src.getOpcode() == ISD::FP_EXTEND &&
15303 Src.getOperand(0).getValueType() == MVT::f16) {
15304 return Src.getOperand(0);
15305 }
15306
15307 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
15308 APFloat Val = CFP->getValueAPF();
15309 bool LosesInfo = true;
15310 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
15311 if (!LosesInfo)
15312 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
15313 }
15314
15315 return SDValue();
15316}
15317
15318SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
15319 DAGCombinerInfo &DCI) const {
15320 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15321 "combine only useful on gfx8");
15322
15323 SDValue TruncSrc = N->getOperand(0);
15324 EVT VT = N->getValueType(0);
15325 if (VT != MVT::f16)
15326 return SDValue();
15327
15328 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15329 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
15330 return SDValue();
15331
15332 SelectionDAG &DAG = DCI.DAG;
15333 SDLoc SL(N);
15334
15335 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
15336 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
15337 // casting back.
15338
15339 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
15340 // fmin(fmax(a, b), fmax(fmin(a, b), c))
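// (Median-of-three identity: the middle value of {a, b, c} equals
// min(max(a, b), max(min(a, b), c)).)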
15341 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
15342 if (!A)
15343 return SDValue();
15344
15345 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
15346 if (!B)
15347 return SDValue();
15348
15349 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
15350 if (!C)
15351 return SDValue();
15352
15353 // This changes signaling nan behavior. If an input is a signaling nan, it
15354 // would have been quieted by the fpext originally. We don't care because
15355 // these are unconstrained ops. If we needed to insert quieting canonicalizes
15356 // we would be worse off than just doing the promotion.
15357 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
15358 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
15359 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
15360 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
15361}
15362
15363unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15364 const SDNode *N0,
15365 const SDNode *N1) const {
15366 EVT VT = N0->getValueType(0);
15367
15368 // Only do this if we are not trying to support denormals. v_mad_f32 does not
15369 // support denormals ever.
15370 if (((VT == MVT::f32 &&
15372 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15375 return ISD::FMAD;
15376
15377 const TargetOptions &Options = DAG.getTarget().Options;
15378 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
15379 (N0->getFlags().hasAllowContract() &&
15380 N1->getFlags().hasAllowContract())) &&
15381 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
15382 return ISD::FMA;
15383 }
15384
15385 return 0;
15386}
15387
15388// For a reassociatable opcode perform:
15389// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
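// Keeping the two uniform operands together lets that sub-expression stay on
// the SALU; only the final op with the divergent operand needs the VALU.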
15390SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
15391 SelectionDAG &DAG) const {
15392 EVT VT = N->getValueType(0);
15393 if (VT != MVT::i32 && VT != MVT::i64)
15394 return SDValue();
15395
15396 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
15397 return SDValue();
15398
15399 unsigned Opc = N->getOpcode();
15400 SDValue Op0 = N->getOperand(0);
15401 SDValue Op1 = N->getOperand(1);
15402
15403 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
15404 return SDValue();
15405
15406 if (Op0->isDivergent())
15407 std::swap(Op0, Op1);
15408
15409 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
15410 return SDValue();
15411
15412 SDValue Op2 = Op1.getOperand(1);
15413 Op1 = Op1.getOperand(0);
15414 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
15415 return SDValue();
15416
15417 if (Op1->isDivergent())
15418 std::swap(Op1, Op2);
15419
15420 SDLoc SL(N);
15421 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
15422 return DAG.getNode(Opc, SL, VT, Add1, Op2);
15423}
15424
15425static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
15426 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
15427 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
15428 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
15429 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
15430 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
15431}
15432
15433// Fold
15434// y = lshr i64 x, 32
15435// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
15436// with Const.hi == -1
15437// To
15438// res = mad_u64_u32 y.lo ,Const.lo, x.lo
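// Why this is correct: with y = x >> 32 and Const.hi == -1,
//   y * Const + x == y * Const.lo + (y << 32) * 0xffffffff + x
//                 == y * Const.lo - (y << 32) + x     (mod 2^64)
//                 == y * Const.lo + x.lo,
// which is exactly mad_u64_u32(y.lo, Const.lo, x.lo).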
15439 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
15440 SDValue MulLHS, SDValue MulRHS,
15441 SDValue AddRHS) {
15442 if (MulRHS.getOpcode() == ISD::SRL)
15443 std::swap(MulLHS, MulRHS);
15444
15445 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
15446 return SDValue();
15447
15448 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
15449 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
15450 MulLHS.getOperand(0) != AddRHS)
15451 return SDValue();
15452
15453 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS);
15454 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15455 return SDValue();
15456
15457 SDValue ConstMul =
15458 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
15459 return getMad64_32(DAG, SL, MVT::i64,
15460 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
15461 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
15462}
15463
15464// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
15465// multiplies, if any.
15466//
15467// Full 64-bit multiplies that feed into an addition are lowered here instead
15468// of using the generic expansion. The generic expansion ends up with
15469// a tree of ADD nodes that prevents us from using the "add" part of the
15470// MAD instruction. The expansion produced here results in a chain of ADDs
15471// instead of a tree.
15472SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
15473 DAGCombinerInfo &DCI) const {
15474 assert(N->isAnyAdd());
15475
15476 SelectionDAG &DAG = DCI.DAG;
15477 EVT VT = N->getValueType(0);
15478 SDLoc SL(N);
15479 SDValue LHS = N->getOperand(0);
15480 SDValue RHS = N->getOperand(1);
15481
15482 if (VT.isVector())
15483 return SDValue();
15484
15485 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
15486 // result in scalar registers for uniform values.
15487 if (!N->isDivergent() && Subtarget->hasSMulHi())
15488 return SDValue();
15489
15490 unsigned NumBits = VT.getScalarSizeInBits();
15491 if (NumBits <= 32 || NumBits > 64)
15492 return SDValue();
15493
15494 if (LHS.getOpcode() != ISD::MUL) {
15495 assert(RHS.getOpcode() == ISD::MUL);
15496 std::swap(LHS, RHS);
15497 }
15498
15499 // Avoid the fold if it would unduly increase the number of multiplies due to
15500 // multiple uses, except on hardware with full-rate multiply-add (which is
15501 // part of full-rate 64-bit ops).
15502 if (!Subtarget->hasFullRate64Ops()) {
15503 unsigned NumUsers = 0;
15504 for (SDNode *User : LHS->users()) {
15505 // There is a use that does not feed into addition, so the multiply can't
15506 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
15507 if (!User->isAnyAdd())
15508 return SDValue();
15509
15510 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
15511 // MUL + 3xADD + 3xADDC over 3xMAD.
15512 ++NumUsers;
15513 if (NumUsers >= 3)
15514 return SDValue();
15515 }
15516 }
15517
15518 SDValue MulLHS = LHS.getOperand(0);
15519 SDValue MulRHS = LHS.getOperand(1);
15520 SDValue AddRHS = RHS;
15521
15522 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
15523 return FoldedMAD;
15524
15525 // Always check whether operands are small unsigned values, since that
15526 // knowledge is useful in more cases. Check for small signed values only if
15527 // doing so can unlock a shorter code sequence.
15528 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
15529 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
15530
15531 bool MulSignedLo = false;
15532 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15533 MulSignedLo =
15534 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
15535 }
15536
15537 // The operands and final result all have the same number of bits. If
15538 // operands need to be extended, they can be extended with garbage. The
15539 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
15540 // truncated away in the end.
15541 if (VT != MVT::i64) {
15542 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
15543 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
15544 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
15545 }
15546
15547 // The basic code generated is conceptually straightforward. Pseudo code:
15548 //
15549 // accum = mad_64_32 lhs.lo, rhs.lo, accum
15550 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
15551 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
15552 //
15553 // The second and third lines are optional, depending on whether the factors
15554 // are {sign,zero}-extended or not.
15555 //
15556 // The actual DAG is noisier than the pseudo code, but only due to
15557 // instructions that disassemble values into low and high parts, and
15558 // assemble the final result.
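// E.g. if both factors are already zero-extended 32-bit values, only the
// first line remains and the whole (mul + add) becomes one mad_u64_u32.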
15559 SDValue One = DAG.getConstant(1, SL, MVT::i32);
15560
15561 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
15562 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
15563 SDValue Accum =
15564 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15565
15566 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15567 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15568
15569 if (!MulLHSUnsigned32) {
15570 auto MulLHSHi =
15571 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
15572 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
15573 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15574 }
15575
15576 if (!MulRHSUnsigned32) {
15577 auto MulRHSHi =
15578 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
15579 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
15580 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
15581 }
15582
15583 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
15584 Accum = DAG.getBitcast(MVT::i64, Accum);
15585 }
15586
15587 if (VT != MVT::i64)
15588 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
15589 return Accum;
15590}
15591
15592SDValue
15593SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
15594 DAGCombinerInfo &DCI) const {
15595 SDValue RHS = N->getOperand(1);
15596 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
15597 if (!CRHS)
15598 return SDValue();
15599
15600 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
15601 // common.
15602 uint64_t Val = CRHS->getZExtValue();
15603 if (countr_zero(Val) >= 32) {
15604 SelectionDAG &DAG = DCI.DAG;
15605 SDLoc SL(N);
15606 SDValue LHS = N->getOperand(0);
15607
15608 // Avoid carry machinery if we know the low half of the add does not
15609 // contribute to the final result.
15610 //
15611 // add i64:x, K if computeTrailingZeros(K) >= 32
15612 // => build_pair (add x.hi, K.hi), x.lo
15613
15614 // Breaking the 64-bit add here with this strange constant is unlikely
15615 // to interfere with addressing mode patterns.
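// E.g. for K = 0x500000000 only the high half changes: the result is
// build_pair (add x.hi, 5), x.lo.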
15616
15617 SDValue Hi = getHiHalf64(LHS, DAG);
15618 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
15619 unsigned Opcode = N->getOpcode();
15620 if (Opcode == ISD::PTRADD)
15621 Opcode = ISD::ADD;
15622 SDValue AddHi =
15623 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
15624
15625 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
15626 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
15627 }
15628
15629 return SDValue();
15630}
15631
15632 // Collect the ultimate src of each of the mul node's operands, and confirm
15633 // each operand is only 8 bits wide.
15634static std::optional<ByteProvider<SDValue>>
15635handleMulOperand(const SDValue &MulOperand) {
15636 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
15637 if (!Byte0 || Byte0->isConstantZero()) {
15638 return std::nullopt;
15639 }
15640 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
15641 if (Byte1 && !Byte1->isConstantZero()) {
15642 return std::nullopt;
15643 }
15644 return Byte0;
15645}
15646
15647static unsigned addPermMasks(unsigned First, unsigned Second) {
15648 unsigned FirstCs = First & 0x0c0c0c0c;
15649 unsigned SecondCs = Second & 0x0c0c0c0c;
15650 unsigned FirstNoCs = First & ~0x0c0c0c0c;
15651 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15652
15653 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15654 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15655 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15656 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15657
15658 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15659}
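// Worked example (illustrative): addPermMasks(0x0c0c0100, 0x03020c0c) ==
// 0x03020100. A byte stays 0x0c (select constant zero) only where both inputs
// had 0x0c; real select bytes from either side are kept.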
15660
15661struct DotSrc {
15662 SDValue SrcOp;
15663 int64_t PermMask;
15664 int64_t DWordOffset;
15665};
15666
15667 static void placeSources(ByteProvider<SDValue> &Src0,
15668 ByteProvider<SDValue> &Src1,
15669 SmallVectorImpl<DotSrc> &Src0s,
15670 SmallVectorImpl<DotSrc> &Src1s, int Step) {
15671
15672 assert(Src0.Src.has_value() && Src1.Src.has_value());
15673 // Src0s and Src1s are empty, just place arbitrarily.
15674 if (Step == 0) {
15675 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
15676 Src0.SrcOffset / 4});
15677 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
15678 Src1.SrcOffset / 4});
15679 return;
15680 }
15681
15682 for (int BPI = 0; BPI < 2; BPI++) {
15683 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
15684 if (BPI == 1) {
15685 BPP = {Src1, Src0};
15686 }
15687 unsigned ZeroMask = 0x0c0c0c0c;
15688 unsigned FMask = 0xFF << (8 * (3 - Step));
15689
15690 unsigned FirstMask =
15691 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15692 unsigned SecondMask =
15693 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15694 // Attempt to find Src vector which contains our SDValue, if so, add our
15695 // perm mask to the existing one. If we are unable to find a match for the
15696 // first SDValue, attempt to find match for the second.
15697 int FirstGroup = -1;
15698 for (int I = 0; I < 2; I++) {
15699 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
15700 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
15701 return IterElt.SrcOp == *BPP.first.Src &&
15702 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15703 };
15704
15705 auto *Match = llvm::find_if(Srcs, MatchesFirst);
15706 if (Match != Srcs.end()) {
15707 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
15708 FirstGroup = I;
15709 break;
15710 }
15711 }
15712 if (FirstGroup != -1) {
15713 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
15714 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
15715 return IterElt.SrcOp == *BPP.second.Src &&
15716 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15717 };
15718 auto *Match = llvm::find_if(Srcs, MatchesSecond);
15719 if (Match != Srcs.end()) {
15720 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
15721 } else
15722 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15723 return;
15724 }
15725 }
15726
15727 // If we have made it here, then we could not find a match in Src0s or Src1s
15728 // for either Src0 or Src1, so just place them arbitrarily.
15729
15730 unsigned ZeroMask = 0x0c0c0c0c;
15731 unsigned FMask = 0xFF << (8 * (3 - Step));
15732
15733 Src0s.push_back(
15734 {*Src0.Src,
15735 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15736 Src0.SrcOffset / 4});
15737 Src1s.push_back(
15738 {*Src1.Src,
15739 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15740 Src1.SrcOffset / 4});
15741}
15742
15743 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
15744 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
15745 bool IsAny) {
15746
15747 // If we just have one source, just permute it accordingly.
15748 if (Srcs.size() == 1) {
15749 auto *Elt = Srcs.begin();
15750 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
15751
15752 // v_perm will produce the original value
15753 if (Elt->PermMask == 0x3020100)
15754 return EltOp;
15755
15756 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15757 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
15758 }
15759
15760 auto *FirstElt = Srcs.begin();
15761 auto *SecondElt = std::next(FirstElt);
15762
15763 SmallVector<SDValue, 2> Perms;
15764
15765 // If we have multiple sources in the chain, combine them via perms (using
15766 // calculated perm mask) and Ors.
15767 while (true) {
15768 auto FirstMask = FirstElt->PermMask;
15769 auto SecondMask = SecondElt->PermMask;
15770
15771 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15772 unsigned FirstPlusFour = FirstMask | 0x04040404;
15773 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
15774 // original 0x0C.
15775 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
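// E.g. a select byte 0x01 becomes 0x05, so it now reads from the other
// v_perm data operand, while a constant-zero byte 0x0c still ends up as 0x0c
// after or-ing FirstCs back in.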
15776
15777 auto PermMask = addPermMasks(FirstMask, SecondMask);
15778 auto FirstVal =
15779 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15780 auto SecondVal =
15781 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
15782
15783 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
15784 SecondVal,
15785 DAG.getConstant(PermMask, SL, MVT::i32)));
15786
15787 FirstElt = std::next(SecondElt);
15788 if (FirstElt == Srcs.end())
15789 break;
15790
15791 SecondElt = std::next(FirstElt);
15792 // If we only have a FirstElt, then just combine that into the cumulative
15793 // source node.
15794 if (SecondElt == Srcs.end()) {
15795 auto EltOp =
15796 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
15797
15798 Perms.push_back(
15799 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15800 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
15801 break;
15802 }
15803 }
15804
15805 assert(Perms.size() == 1 || Perms.size() == 2);
15806 return Perms.size() == 2
15807 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
15808 : Perms[0];
15809}
15810
15811static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
15812 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15813 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15814 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15815 EntryMask += ZeroMask;
15816 }
15817}
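// E.g. with ChainLength == 2 a mask built as 0x01000c0c (two select bytes in
// the high half) is shifted down to 0x00000100 and topped up with 0x0c0c0000,
// giving 0x0c0c0100: the used bytes move to the low half and the unused high
// bytes read as constant zero.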
15818
15819static bool isMul(const SDValue Op) {
15820 auto Opcode = Op.getOpcode();
15821
15822 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15823 Opcode == AMDGPUISD::MUL_I24);
15824}
15825
15826static std::optional<bool>
15827 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
15828 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
15829 const SDValue &S1Op, const SelectionDAG &DAG) {
15830 // If both ops are i8s (pre legalize-dag), then the signedness semantics
15831 // of the dot4 are irrelevant.
15832 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
15833 return false;
15834
15835 auto Known0 = DAG.computeKnownBits(S0Op, 0);
15836 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
15837 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
15838 auto Known1 = DAG.computeKnownBits(S1Op, 0);
15839 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
15840 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
15841
15842 assert(!(S0IsUnsigned && S0IsSigned));
15843 assert(!(S1IsUnsigned && S1IsSigned));
15844
15845 // There are 9 possible permutations of
15846 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
15847
15848 // In two permutations, the sign bits are known to be the same for both Ops,
15849 // so simply return Signed / Unsigned corresponding to the MSB
15850
15851 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
15852 return S0IsSigned;
15853
15854 // In another two permutations, the sign bits are known to be opposite. In
15855 // this case return std::nullopt to indicate a bad match.
15856
15857 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
15858 return std::nullopt;
15859
15860 // In the remaining five permutations, we don't know the value of the sign
15861 // bit for at least one Op. Since we have a valid ByteProvider, we know that
15862 // the upper bits must be extension bits. Thus, the only way for the sign
15863 // bit to be unknown is if it was sign extended from an unknown value, or if
15864 // it was any extended. In either case, it is correct to use the signed
15865 // version of the dot4 signedness semantics.
15866
15867 // In two such permutations, we know the sign bit is set for
15868 // one op, and the other is unknown. It is okay to use the signed version of
15869 // dot4.
15870 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
15871 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
15872 return true;
15873
15874 // In one such permutation, we don't know either of the sign bits. It is okay
15875 // to use the signed version of dot4.
15876 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
15877 return true;
15878
15879 // In two such permutations, we know the sign bit is unset for
15880 // one op, and the other is unknown. Return std::nullopt to indicate a
15881 // bad match.
15882 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
15883 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
15884 return std::nullopt;
15885
15886 llvm_unreachable("Fully covered condition");
15887}
15888
15889SDValue SITargetLowering::performAddCombine(SDNode *N,
15890 DAGCombinerInfo &DCI) const {
15891 SelectionDAG &DAG = DCI.DAG;
15892 EVT VT = N->getValueType(0);
15893 SDLoc SL(N);
15894 SDValue LHS = N->getOperand(0);
15895 SDValue RHS = N->getOperand(1);
15896
15897 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
15898 if (Subtarget->hasMad64_32()) {
15899 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
15900 return Folded;
15901 }
15902 }
15903
15904 if (SDValue V = reassociateScalarOps(N, DAG)) {
15905 return V;
15906 }
15907
15908 if (VT == MVT::i64) {
15909 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
15910 return Folded;
15911 }
15912
15913 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
15914 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
15915 SDValue TempNode(N, 0);
15916 std::optional<bool> IsSigned;
15917 SmallVector<DotSrc, 4> Src0s;
15918 SmallVector<DotSrc, 4> Src1s;
15919 SmallVector<SDValue, 4> Src2s;
15920
15921 // Match the v_dot4 tree, while collecting src nodes.
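// The tree being matched is, conceptually, add(mul(a0,b0), add(mul(a1,b1),
// add(mul(a2,b2), ...))): a chain of adds where one operand of each add is a
// byte-sized multiply.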
15922 int ChainLength = 0;
15923 for (int I = 0; I < 4; I++) {
15924 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
15925 if (MulIdx == -1)
15926 break;
15927 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
15928 if (!Src0)
15929 break;
15930 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
15931 if (!Src1)
15932 break;
15933
15934 auto IterIsSigned = checkDot4MulSignedness(
15935 TempNode->getOperand(MulIdx), *Src0, *Src1,
15936 TempNode->getOperand(MulIdx)->getOperand(0),
15937 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
15938 if (!IterIsSigned)
15939 break;
15940 if (!IsSigned)
15941 IsSigned = *IterIsSigned;
15942 if (*IterIsSigned != *IsSigned)
15943 break;
15944 placeSources(*Src0, *Src1, Src0s, Src1s, I);
15945 auto AddIdx = 1 - MulIdx;
15946 // Allow the special case where add (add (mul24, 0), mul24) has been
15947 // simplified to add (mul24, mul24).
15948 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
15949 Src2s.push_back(TempNode->getOperand(AddIdx));
15950 auto Src0 =
15951 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
15952 if (!Src0)
15953 break;
15954 auto Src1 =
15955 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
15956 if (!Src1)
15957 break;
15958 auto IterIsSigned = checkDot4MulSignedness(
15959 TempNode->getOperand(AddIdx), *Src0, *Src1,
15960 TempNode->getOperand(AddIdx)->getOperand(0),
15961 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
15962 if (!IterIsSigned)
15963 break;
15964 assert(IsSigned);
15965 if (*IterIsSigned != *IsSigned)
15966 break;
15967 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
15968 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
15969 ChainLength = I + 2;
15970 break;
15971 }
15972
15973 TempNode = TempNode->getOperand(AddIdx);
15974 Src2s.push_back(TempNode);
15975 ChainLength = I + 1;
15976 if (TempNode->getNumOperands() < 2)
15977 break;
15978 LHS = TempNode->getOperand(0);
15979 RHS = TempNode->getOperand(1);
15980 }
15981
15982 if (ChainLength < 2)
15983 return SDValue();
15984
15985 // Masks were constructed with the assumption that we would find a chain of
15986 // length 4. If not, then we need to zero out the most significant bytes (via
15987 // a perm mask byte of 0x0c) so they do not affect the dot calculation.
15988 if (ChainLength < 4) {
15989 fixMasks(Src0s, ChainLength);
15990 fixMasks(Src1s, ChainLength);
15991 }
15992
15993 SDValue Src0, Src1;
15994
15995 // If we are just using a single source for both, and have permuted the
15996 // bytes consistently, we can just use the sources without permuting
15997 // (commutation).
15998 bool UseOriginalSrc = false;
15999 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16000 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16001 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16002 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16003 SmallVector<unsigned, 4> SrcBytes;
16004 auto Src0Mask = Src0s.begin()->PermMask;
16005 SrcBytes.push_back(Src0Mask & 0xFF000000);
16006 bool UniqueEntries = true;
16007 for (auto I = 1; I < 4; I++) {
16008 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16009
16010 if (is_contained(SrcBytes, NextByte)) {
16011 UniqueEntries = false;
16012 break;
16013 }
16014 SrcBytes.push_back(NextByte);
16015 }
16016
16017 if (UniqueEntries) {
16018 UseOriginalSrc = true;
16019
16020 auto *FirstElt = Src0s.begin();
16021 auto FirstEltOp =
16022 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
16023
16024 auto *SecondElt = Src1s.begin();
16025 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
16026 SecondElt->DWordOffset);
16027
16028 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
16029 MVT::getIntegerVT(32));
16030 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
16031 MVT::getIntegerVT(32));
16032 }
16033 }
16034
16035 if (!UseOriginalSrc) {
16036 Src0 = resolveSources(DAG, SL, Src0s, false, true);
16037 Src1 = resolveSources(DAG, SL, Src1s, false, true);
16038 }
16039
16040 assert(IsSigned);
16041 SDValue Src2 =
16042 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16043
16044 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
16045 : Intrinsic::amdgcn_udot4,
16046 SL, MVT::i64);
16047
16048 assert(!VT.isVector());
16049 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
16050 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
16051
16052 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
16053 }
16054
16055 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16056 return SDValue();
16057
16058 // add x, zext (setcc) => uaddo_carry x, 0, setcc
16059 // add x, sext (setcc) => usubo_carry x, 0, setcc
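// Canonicalize: if the extend or carry is on the LHS, swap the operands so the
// checks below only need to inspect RHS.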
16060 unsigned Opc = LHS.getOpcode();
16061 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
16062 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
16063 std::swap(RHS, LHS);
16064
16065 Opc = RHS.getOpcode();
16066 switch (Opc) {
16067 default:
16068 break;
16069 case ISD::ZERO_EXTEND:
16070 case ISD::SIGN_EXTEND:
16071 case ISD::ANY_EXTEND: {
16072 auto Cond = RHS.getOperand(0);
16073 // If this won't be a real VOPC output, we would still need to insert an
16074 // extra instruction anyway.
16075 if (!isBoolSGPR(Cond))
16076 break;
16077 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16078 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16079 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
16080 return DAG.getNode(Opc, SL, VTList, Args);
16081 }
16082 case ISD::UADDO_CARRY: {
16083 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
16084 if (!isNullConstant(RHS.getOperand(1)))
16085 break;
16086 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
16087 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
16088 }
16089 }
16090 return SDValue();
16091}
16092
16093SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
16094 DAGCombinerInfo &DCI) const {
16095 SelectionDAG &DAG = DCI.DAG;
16096 SDLoc DL(N);
16097 EVT VT = N->getValueType(0);
16098 SDValue N0 = N->getOperand(0);
16099 SDValue N1 = N->getOperand(1);
16100
16101 // The following folds transform PTRADDs into regular arithmetic in cases
16102 // where the PTRADD wouldn't be folded as an immediate offset into memory
16103 // instructions anyway. They are target-specific in that other targets might
16104 // prefer to not lose information about the pointer arithmetic.
16105
16106 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
16107 // Adapted from DAGCombiner::visitADDLikeCommutative.
16108 SDValue V, K;
16109 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
16110 SDNodeFlags ShlFlags = N1->getFlags();
16111 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
16112 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
16113 // preserved.
16114 SDNodeFlags NewShlFlags =
16115 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
16116 ? SDNodeFlags::NoSignedWrap
16117 : SDNodeFlags();
16118 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
16119 DCI.AddToWorklist(Inner.getNode());
16120 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
16121 }
16122
16123 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
16124 // performAddCombine.
16125 if (N1.getOpcode() == ISD::MUL) {
16126 if (Subtarget->hasMad64_32()) {
16127 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16128 return Folded;
16129 }
16130 }
16131
16132 // If the 32 low bits of the constant are all zero, there is nothing to fold
16133 // into an immediate offset, so it's better to eliminate the unnecessary
16134 // addition for the lower 32 bits than to preserve the PTRADD.
16135 // Analogous to a fold in performAddCombine.
16136 if (VT == MVT::i64) {
16137 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16138 return Folded;
16139 }
16140
16141 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
16142 return SDValue();
16143
16144 SDValue X = N0;
16145 SDValue Y = N1.getOperand(0);
16146 SDValue Z = N1.getOperand(1);
16147 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
16148 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
16149
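// Reassociate only when neither addend is a constant, x is uniform, and exactly
// one of y and z is divergent (see the rationale below).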
16150 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16151 Y->isDivergent() != Z->isDivergent()) {
16152 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
16153 // y are uniform and z isn't.
16154 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
16155 // z are uniform and y isn't.
16156 // The goal is to push uniform operands up in the computation, so that they
16157 // can be handled with scalar operations. We can't use reassociateScalarOps
16158 // for this since it requires two identical commutative operations to
16159 // reassociate.
16160 if (Y->isDivergent())
16161 std::swap(Y, Z);
16162 // If both additions in the original were NUW, reassociation preserves that.
16163 SDNodeFlags ReassocFlags =
16164 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
16165 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
16166 DCI.AddToWorklist(UniformInner.getNode());
16167 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
16168 }
16169
16170 return SDValue();
16171}
16172
16173SDValue SITargetLowering::performSubCombine(SDNode *N,
16174 DAGCombinerInfo &DCI) const {
16175 SelectionDAG &DAG = DCI.DAG;
16176 EVT VT = N->getValueType(0);
16177
16178 if (VT == MVT::i64) {
16179 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16180 return Folded;
16181 }
16182
16183 if (VT != MVT::i32)
16184 return SDValue();
16185
16186 SDLoc SL(N);
16187 SDValue LHS = N->getOperand(0);
16188 SDValue RHS = N->getOperand(1);
16189
16190 // sub x, zext (setcc) => usubo_carry x, 0, setcc
16191 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
16192 unsigned Opc = RHS.getOpcode();
16193 switch (Opc) {
16194 default:
16195 break;
16196 case ISD::ZERO_EXTEND:
16197 case ISD::SIGN_EXTEND:
16198 case ISD::ANY_EXTEND: {
16199 auto Cond = RHS.getOperand(0);
16200 // If this won't be a real VOPC output, we would still need to insert an
16201 // extra instruction anyway.
16202 if (!isBoolSGPR(Cond))
16203 break;
16204 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16205 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
16206 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
16207 return DAG.getNode(Opc, SL, VTList, Args);
16208 }
16209 }
16210
16211 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
16212 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
16213 if (!isNullConstant(LHS.getOperand(1)))
16214 return SDValue();
16215 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
16216 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
16217 }
16218 return SDValue();
16219}
16220
16221SDValue
16222SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16223 DAGCombinerInfo &DCI) const {
16224
16225 if (N->getValueType(0) != MVT::i32)
16226 return SDValue();
16227
16228 if (!isNullConstant(N->getOperand(1)))
16229 return SDValue();
16230
16231 SelectionDAG &DAG = DCI.DAG;
16232 SDValue LHS = N->getOperand(0);
16233
16234 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
16235 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
16236 unsigned LHSOpc = LHS.getOpcode();
16237 unsigned Opc = N->getOpcode();
16238 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
16239 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
16240 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
16241 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16242 }
16243 return SDValue();
16244}
16245
16246SDValue SITargetLowering::performFAddCombine(SDNode *N,
16247 DAGCombinerInfo &DCI) const {
16248 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16249 return SDValue();
16250
16251 SelectionDAG &DAG = DCI.DAG;
16252 EVT VT = N->getValueType(0);
16253
16254 SDLoc SL(N);
16255 SDValue LHS = N->getOperand(0);
16256 SDValue RHS = N->getOperand(1);
16257
16258 // These should really be instruction patterns, but writing patterns with
16259 // source modifiers is a pain.
16260
16261 // fadd (fadd (a, a), b) -> mad 2.0, a, b
16262 if (LHS.getOpcode() == ISD::FADD) {
16263 SDValue A = LHS.getOperand(0);
16264 if (A == LHS.getOperand(1)) {
16265 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16266 if (FusedOp != 0) {
16267 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16268 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16269 }
16270 }
16271 }
16272
16273 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
16274 if (RHS.getOpcode() == ISD::FADD) {
16275 SDValue A = RHS.getOperand(0);
16276 if (A == RHS.getOperand(1)) {
16277 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16278 if (FusedOp != 0) {
16279 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16280 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16281 }
16282 }
16283 }
16284
16285 return SDValue();
16286}
16287
16288SDValue SITargetLowering::performFSubCombine(SDNode *N,
16289 DAGCombinerInfo &DCI) const {
16290 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
16291 return SDValue();
16292
16293 SelectionDAG &DAG = DCI.DAG;
16294 SDLoc SL(N);
16295 EVT VT = N->getValueType(0);
16296 assert(!VT.isVector());
16297
16298 // Try to get the fneg to fold into the source modifier. This undoes generic
16299 // DAG combines and folds them into the mad.
16300 //
16301 // Only do this if we are not trying to support denormals. v_mad_f32 does
16302 // not support denormals ever.
16303 SDValue LHS = N->getOperand(0);
16304 SDValue RHS = N->getOperand(1);
16305 if (LHS.getOpcode() == ISD::FADD) {
16306 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
16307 SDValue A = LHS.getOperand(0);
16308 if (A == LHS.getOperand(1)) {
16309 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16310 if (FusedOp != 0) {
16311 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
16312 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
16313
16314 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16315 }
16316 }
16317 }
16318
16319 if (RHS.getOpcode() == ISD::FADD) {
16320 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
16321
16322 SDValue A = RHS.getOperand(0);
16323 if (A == RHS.getOperand(1)) {
16324 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16325 if (FusedOp != 0) {
16326 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
16327 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16328 }
16329 }
16330 }
16331
16332 return SDValue();
16333}
16334
16335SDValue SITargetLowering::performFDivCombine(SDNode *N,
16336 DAGCombinerInfo &DCI) const {
16337 SelectionDAG &DAG = DCI.DAG;
16338 SDLoc SL(N);
16339 EVT VT = N->getValueType(0);
16340 if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
16341 return SDValue();
16342
16343 SDValue LHS = N->getOperand(0);
16344 SDValue RHS = N->getOperand(1);
16345
16346 SDNodeFlags Flags = N->getFlags();
16347 SDNodeFlags RHSFlags = RHS->getFlags();
16348 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
16349 !RHS->hasOneUse())
16350 return SDValue();
16351
16352 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
16353 bool IsNegative = false;
16354 if (CLHS->isExactlyValue(1.0) ||
16355 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16356 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
16357 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
16358 if (RHS.getOpcode() == ISD::FSQRT) {
16359 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
16360 SDValue Rsq =
16361 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16362 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
16363 }
16364 }
16365 }
16366
16367 return SDValue();
16368}
16369
16370SDValue SITargetLowering::performFMulCombine(SDNode *N,
16371 DAGCombinerInfo &DCI) const {
16372 SelectionDAG &DAG = DCI.DAG;
16373 EVT VT = N->getValueType(0);
16374 EVT ScalarVT = VT.getScalarType();
16375 EVT IntVT = VT.changeElementType(MVT::i32);
16376
16377 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16378 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16379 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
16380 return SDValue();
16381 }
16382
16383 SDValue LHS = N->getOperand(0);
16384 SDValue RHS = N->getOperand(1);
16385
16386 // It is cheaper to realize i32 inline constants than to materialize
16387 // f16 or f64 (or even non-inline f32) values; this is made possible via
16388 // ldexp usage, as shown below:
16389 //
16390 // Given : A = 2^a & B = 2^b ; where a and b are integers.
16391 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
16392 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
16393 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16394 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
16395 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
16396 if (!TrueNode)
16397 return SDValue();
16398 const ConstantFPSDNode *FalseNode =
16399 isConstOrConstSplatFP(RHS.getOperand(2));
16400 if (!FalseNode)
16401 return SDValue();
16402
16403 if (TrueNode->isNegative() != FalseNode->isNegative())
16404 return SDValue();
16405
16406 // For f32, only non-inline constants should be transformed.
16407 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16408 if (ScalarVT == MVT::f32 &&
16409 TII->isInlineConstant(TrueNode->getValueAPF()) &&
16410 TII->isInlineConstant(FalseNode->getValueAPF()))
16411 return SDValue();
16412
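// getExactLog2Abs returns INT_MIN when the constant is not an exact power of
// two, in which case the ldexp rewrite does not apply.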
16413 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
16414 if (TrueNodeExpVal == INT_MIN)
16415 return SDValue();
16416 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
16417 if (FalseNodeExpVal == INT_MIN)
16418 return SDValue();
16419
16420 SDLoc SL(N);
16421 SDValue SelectNode =
16422 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
16423 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
16424 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
16425
16426 LHS = TrueNode->isNegative()
16427 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
16428 : LHS;
16429
16430 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
16431 }
16432
16433 return SDValue();
16434}
16435
16436SDValue SITargetLowering::performFMACombine(SDNode *N,
16437 DAGCombinerInfo &DCI) const {
16438 SelectionDAG &DAG = DCI.DAG;
16439 EVT VT = N->getValueType(0);
16440 SDLoc SL(N);
16441
16442 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16443 return SDValue();
16444
16445 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
16446 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
16447 SDValue Op1 = N->getOperand(0);
16448 SDValue Op2 = N->getOperand(1);
16449 SDValue FMA = N->getOperand(2);
16450
16451 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
16452 Op2.getOpcode() != ISD::FP_EXTEND)
16453 return SDValue();
16454
16455 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
16456 // regardless of the denorm mode setting. Therefore,
16457 // fp-contract is sufficient to allow generating fdot2.
16458 const TargetOptions &Options = DAG.getTarget().Options;
16459 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16460 (N->getFlags().hasAllowContract() &&
16461 FMA->getFlags().hasAllowContract())) {
16462 Op1 = Op1.getOperand(0);
16463 Op2 = Op2.getOperand(0);
16464 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16465 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16466 return SDValue();
16467
16468 SDValue Vec1 = Op1.getOperand(0);
16469 SDValue Idx1 = Op1.getOperand(1);
16470 SDValue Vec2 = Op2.getOperand(0);
16471
16472 SDValue FMAOp1 = FMA.getOperand(0);
16473 SDValue FMAOp2 = FMA.getOperand(1);
16474 SDValue FMAAcc = FMA.getOperand(2);
16475
16476 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
16477 FMAOp2.getOpcode() != ISD::FP_EXTEND)
16478 return SDValue();
16479
16480 FMAOp1 = FMAOp1.getOperand(0);
16481 FMAOp2 = FMAOp2.getOperand(0);
16482 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16483 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16484 return SDValue();
16485
16486 SDValue Vec3 = FMAOp1.getOperand(0);
16487 SDValue Vec4 = FMAOp2.getOperand(0);
16488 SDValue Idx2 = FMAOp1.getOperand(1);
16489
16490 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
16491 // Idx1 and Idx2 cannot be the same.
16492 Idx1 == Idx2)
16493 return SDValue();
16494
16495 if (Vec1 == Vec2 || Vec3 == Vec4)
16496 return SDValue();
16497
16498 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
16499 return SDValue();
16500
16501 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16502 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16503 DAG.getTargetConstant(0, SL, MVT::i1));
16504 }
16505 }
16506 return SDValue();
16507}
16508
16509SDValue SITargetLowering::performSetCCCombine(SDNode *N,
16510 DAGCombinerInfo &DCI) const {
16511 SelectionDAG &DAG = DCI.DAG;
16512 SDLoc SL(N);
16513
16514 SDValue LHS = N->getOperand(0);
16515 SDValue RHS = N->getOperand(1);
16516 EVT VT = LHS.getValueType();
16517 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
16518
16519 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16520 if (!CRHS) {
16521 CRHS = dyn_cast<ConstantSDNode>(LHS);
16522 if (CRHS) {
16523 std::swap(LHS, RHS);
16524 CC = getSetCCSwappedOperands(CC);
16525 }
16526 }
16527
16528 if (CRHS) {
16529 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
16530 isBoolSGPR(LHS.getOperand(0))) {
16531 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
16532 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
16533 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
16534 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
16535 if ((CRHS->isAllOnes() &&
16536 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
16537 (CRHS->isZero() &&
16538 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
16539 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16540 DAG.getAllOnesConstant(SL, MVT::i1));
16541 if ((CRHS->isAllOnes() &&
16542 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
16543 (CRHS->isZero() &&
16544 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
16545 return LHS.getOperand(0);
16546 }
16547
16548 const APInt &CRHSVal = CRHS->getAPIntValue();
16549 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
16550 LHS.getOpcode() == ISD::SELECT &&
16551 isa<ConstantSDNode>(LHS.getOperand(1)) &&
16552 isa<ConstantSDNode>(LHS.getOperand(2)) &&
16553 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16554 isBoolSGPR(LHS.getOperand(0))) {
16555 // Given CT != FT:
16556 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
16557 // setcc (select cc, CT, CF), CF, ne => cc
16558 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
16559 // setcc (select cc, CT, CF), CT, eq => cc
16560 const APInt &CT = LHS.getConstantOperandAPInt(1);
16561 const APInt &CF = LHS.getConstantOperandAPInt(2);
16562
16563 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
16564 (CT == CRHSVal && CC == ISD::SETNE))
16565 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
16566 DAG.getAllOnesConstant(SL, MVT::i1));
16567 if ((CF == CRHSVal && CC == ISD::SETNE) ||
16568 (CT == CRHSVal && CC == ISD::SETEQ))
16569 return LHS.getOperand(0);
16570 }
16571 }
16572
16573 if (VT != MVT::f32 && VT != MVT::f64 &&
16574 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16575 return SDValue();
16576
16577 // Match isinf/isfinite pattern
16578 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
16579 // (fcmp one (fabs x), inf) -> (fp_class x,
16580 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
16581 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
16582 LHS.getOpcode() == ISD::FABS) {
16583 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
16584 if (!CRHS)
16585 return SDValue();
16586
16587 const APFloat &APF = CRHS->getValueAPF();
16588 if (APF.isInfinity() && !APF.isNegative()) {
16589 const unsigned IsInfMask =
16590 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
16591 const unsigned IsFiniteMask =
16592 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
16593 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
16594 SIInstrFlags::P_SUBNORMAL;
16595 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
16596 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16597 DAG.getConstant(Mask, SL, MVT::i32));
16598 }
16599 }
16600
16601 return SDValue();
16602}
16603
16604SDValue
16605SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16606 DAGCombinerInfo &DCI) const {
16607 SelectionDAG &DAG = DCI.DAG;
16608 SDLoc SL(N);
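// Offset selects which byte (0-3) of the source this CVT_F32_UBYTEn node converts.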
16609 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16610
16611 SDValue Src = N->getOperand(0);
16612 SDValue Shift = N->getOperand(0);
16613
16614 // TODO: Extend type shouldn't matter (assuming legal types).
16615 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
16616 Shift = Shift.getOperand(0);
16617
16618 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
16619 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
16620 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
16621 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
16622 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
16623 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
16624 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
16625 SDValue Shifted = DAG.getZExtOrTrunc(
16626 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
16627
16628 unsigned ShiftOffset = 8 * Offset;
16629 if (Shift.getOpcode() == ISD::SHL)
16630 ShiftOffset -= C->getZExtValue();
16631 else
16632 ShiftOffset += C->getZExtValue();
16633
16634 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16635 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16636 MVT::f32, Shifted);
16637 }
16638 }
16639 }
16640
16641 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16642 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
16643 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
16644 // We simplified Src. If this node is not dead, visit it again so it is
16645 // folded properly.
16646 if (N->getOpcode() != ISD::DELETED_NODE)
16647 DCI.AddToWorklist(N);
16648 return SDValue(N, 0);
16649 }
16650
16651 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
16652 if (SDValue DemandedSrc =
16653 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
16654 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16655
16656 return SDValue();
16657}
16658
16659SDValue SITargetLowering::performClampCombine(SDNode *N,
16660 DAGCombinerInfo &DCI) const {
16661 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
16662 if (!CSrc)
16663 return SDValue();
16664
16665 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16666 const APFloat &F = CSrc->getValueAPF();
16667 APFloat Zero = APFloat::getZero(F.getSemantics());
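// Negative sources clamp to 0.0; a NaN source also clamps to 0.0 when DX10Clamp
// is set, otherwise the NaN constant is returned unchanged below.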
16668 if (F < Zero ||
16669 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16670 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16671 }
16672
16673 APFloat One(F.getSemantics(), "1.0");
16674 if (F > One)
16675 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16676
16677 return SDValue(CSrc, 0);
16678}
16679
16680SDValue SITargetLowering::performSelectCombine(SDNode *N,
16681 DAGCombinerInfo &DCI) const {
16682
16683 // Try to fold CMP + SELECT patterns with shared constants (both FP and
16684 // integer).
16685 // Detect when CMP and SELECT use the same constant and fold them to avoid
16686 // loading the constant twice. Specifically handles patterns like:
16687 // %cmp = icmp eq i32 %val, 4242
16688 // %sel = select i1 %cmp, i32 4242, i32 %other
16689 // It can be optimized to reuse %val instead of 4242 in select.
16690 SDValue Cond = N->getOperand(0);
16691 SDValue TrueVal = N->getOperand(1);
16692 SDValue FalseVal = N->getOperand(2);
16693
16694 // Check if condition is a comparison.
16695 if (Cond.getOpcode() != ISD::SETCC)
16696 return SDValue();
16697
16698 SDValue LHS = Cond.getOperand(0);
16699 SDValue RHS = Cond.getOperand(1);
16700 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
16701
16702 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16703 bool isInteger = LHS.getValueType().isInteger();
16704
16705 // Handle simple floating-point and integer types only.
16706 if (!isFloatingPoint && !isInteger)
16707 return SDValue();
16708
16709 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
16710 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
16711 if (!isEquality && !isNonEquality)
16712 return SDValue();
16713
16714 SDValue ArgVal, ConstVal;
16715 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
16716 (isInteger && isa<ConstantSDNode>(RHS))) {
16717 ConstVal = RHS;
16718 ArgVal = LHS;
16719 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
16720 (isInteger && isa<ConstantSDNode>(LHS))) {
16721 ConstVal = LHS;
16722 ArgVal = RHS;
16723 } else {
16724 return SDValue();
16725 }
16726
16727 // Skip optimization for inlinable immediates.
16728 if (isFloatingPoint) {
16729 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
16730 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16731 return SDValue();
16732 } else {
16733 if (AMDGPU::isInlinableIntLiteral(
16734 cast<ConstantSDNode>(ConstVal)->getSExtValue()))
16735 return SDValue();
16736 }
16737
16738 // For equality and non-equality comparisons, patterns:
16739 // select (setcc x, const), const, y -> select (setcc x, const), x, y
16740 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
16741 if (!(isEquality && TrueVal == ConstVal) &&
16742 !(isNonEquality && FalseVal == ConstVal))
16743 return SDValue();
16744
16745 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
16746 SDValue SelectRHS =
16747 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
16748 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
16749 SelectLHS, SelectRHS);
16750}
16751
16752 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
16753 DAGCombinerInfo &DCI) const {
16754 switch (N->getOpcode()) {
16755 case ISD::ADD:
16756 case ISD::SUB:
16757 case ISD::SHL:
16758 case ISD::SRL:
16759 case ISD::SRA:
16760 case ISD::AND:
16761 case ISD::OR:
16762 case ISD::XOR:
16763 case ISD::MUL:
16764 case ISD::SETCC:
16765 case ISD::SELECT:
16766 case ISD::SMIN:
16767 case ISD::SMAX:
16768 case ISD::UMIN:
16769 case ISD::UMAX:
16770 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16771 return Res;
16772 break;
16773 default:
16774 break;
16775 }
16776
16777 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
16778 return SDValue();
16779
16780 switch (N->getOpcode()) {
16781 case ISD::ADD:
16782 return performAddCombine(N, DCI);
16783 case ISD::PTRADD:
16784 return performPtrAddCombine(N, DCI);
16785 case ISD::SUB:
16786 return performSubCombine(N, DCI);
16787 case ISD::UADDO_CARRY:
16788 case ISD::USUBO_CARRY:
16789 return performAddCarrySubCarryCombine(N, DCI);
16790 case ISD::FADD:
16791 return performFAddCombine(N, DCI);
16792 case ISD::FSUB:
16793 return performFSubCombine(N, DCI);
16794 case ISD::FDIV:
16795 return performFDivCombine(N, DCI);
16796 case ISD::FMUL:
16797 return performFMulCombine(N, DCI);
16798 case ISD::SETCC:
16799 return performSetCCCombine(N, DCI);
16800 case ISD::SELECT:
16801 if (auto Res = performSelectCombine(N, DCI))
16802 return Res;
16803 break;
16804 case ISD::FMAXNUM:
16805 case ISD::FMINNUM:
16806 case ISD::FMAXNUM_IEEE:
16807 case ISD::FMINNUM_IEEE:
16808 case ISD::FMAXIMUM:
16809 case ISD::FMINIMUM:
16810 case ISD::FMAXIMUMNUM:
16811 case ISD::FMINIMUMNUM:
16812 case ISD::SMAX:
16813 case ISD::SMIN:
16814 case ISD::UMAX:
16815 case ISD::UMIN:
16816 case AMDGPUISD::FMIN_LEGACY:
16817 case AMDGPUISD::FMAX_LEGACY:
16818 return performMinMaxCombine(N, DCI);
16819 case ISD::FMA:
16820 return performFMACombine(N, DCI);
16821 case ISD::AND:
16822 return performAndCombine(N, DCI);
16823 case ISD::OR:
16824 return performOrCombine(N, DCI);
16825 case ISD::FSHR: {
16826 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16827 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
16828 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
16829 return matchPERM(N, DCI);
16830 }
16831 break;
16832 }
16833 case ISD::XOR:
16834 return performXorCombine(N, DCI);
16835 case ISD::ZERO_EXTEND:
16836 return performZeroExtendCombine(N, DCI);
16837 case ISD::SIGN_EXTEND_INREG:
16838 return performSignExtendInRegCombine(N, DCI);
16839 case AMDGPUISD::FP_CLASS:
16840 return performClassCombine(N, DCI);
16841 case ISD::FCANONICALIZE:
16842 return performFCanonicalizeCombine(N, DCI);
16843 case AMDGPUISD::RCP:
16844 return performRcpCombine(N, DCI);
16845 case ISD::FLDEXP:
16846 case AMDGPUISD::FRACT:
16847 case AMDGPUISD::RSQ:
16848 case AMDGPUISD::RCP_LEGACY:
16849 case AMDGPUISD::RCP_IFLAG:
16850 case AMDGPUISD::RSQ_CLAMP: {
16851 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
16852 SDValue Src = N->getOperand(0);
16853 if (Src.isUndef())
16854 return Src;
16855 break;
16856 }
16857 case ISD::SINT_TO_FP:
16858 case ISD::UINT_TO_FP:
16859 return performUCharToFloatCombine(N, DCI);
16860 case ISD::FCOPYSIGN:
16861 return performFCopySignCombine(N, DCI);
16862 case AMDGPUISD::CVT_F32_UBYTE0:
16863 case AMDGPUISD::CVT_F32_UBYTE1:
16864 case AMDGPUISD::CVT_F32_UBYTE2:
16865 case AMDGPUISD::CVT_F32_UBYTE3:
16866 return performCvtF32UByteNCombine(N, DCI);
16867 case AMDGPUISD::FMED3:
16868 return performFMed3Combine(N, DCI);
16869 case AMDGPUISD::CVT_PKRTZ_F16_F32:
16870 return performCvtPkRTZCombine(N, DCI);
16871 case AMDGPUISD::CLAMP:
16872 return performClampCombine(N, DCI);
16873 case ISD::SCALAR_TO_VECTOR: {
16874 SelectionDAG &DAG = DCI.DAG;
16875 EVT VT = N->getValueType(0);
16876
16877 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
16878 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
16879 SDLoc SL(N);
16880 SDValue Src = N->getOperand(0);
16881 EVT EltVT = Src.getValueType();
16882 if (EltVT != MVT::i16)
16883 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
16884
16885 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
16886 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
16887 }
16888
16889 break;
16890 }
16891 case ISD::EXTRACT_VECTOR_ELT:
16892 return performExtractVectorEltCombine(N, DCI);
16893 case ISD::INSERT_VECTOR_ELT:
16894 return performInsertVectorEltCombine(N, DCI);
16895 case ISD::FP_ROUND:
16896 return performFPRoundCombine(N, DCI);
16897 case ISD::LOAD: {
16898 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
16899 return Widened;
16900 [[fallthrough]];
16901 }
16902 default: {
16903 if (!DCI.isBeforeLegalize()) {
16904 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
16905 return performMemSDNodeCombine(MemNode, DCI);
16906 }
16907
16908 break;
16909 }
16910 }
16911
16913}
16914
16915/// Helper function for adjustWritemask
16916static unsigned SubIdx2Lane(unsigned Idx) {
16917 switch (Idx) {
16918 default:
16919 return ~0u;
16920 case AMDGPU::sub0:
16921 return 0;
16922 case AMDGPU::sub1:
16923 return 1;
16924 case AMDGPU::sub2:
16925 return 2;
16926 case AMDGPU::sub3:
16927 return 3;
16928 case AMDGPU::sub4:
16929 return 4; // Possible with TFE/LWE
16930 }
16931}
16932
16933/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
16934SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
16935 SelectionDAG &DAG) const {
16936 unsigned Opcode = Node->getMachineOpcode();
16937
16938 // Subtract 1 because the vdata output is not a MachineSDNode operand.
16939 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
16940 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
16941 return Node; // not implemented for D16
16942
16943 SDNode *Users[5] = {nullptr};
16944 unsigned Lane = 0;
16945 unsigned DmaskIdx =
16946 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
16947 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
16948 unsigned NewDmask = 0;
16949 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
16950 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
16951 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
16952 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
16953 unsigned TFCLane = 0;
16954 bool HasChain = Node->getNumValues() > 1;
16955
16956 if (OldDmask == 0) {
16957 // These are folded out, but on the off chance it happens, don't assert.
16958 return Node;
16959 }
16960
16961 unsigned OldBitsSet = llvm::popcount(OldDmask);
16962 // Work out which is the TFE/LWE lane if that is enabled.
16963 if (UsesTFC) {
16964 TFCLane = OldBitsSet;
16965 }
16966
16967 // Try to figure out the used register components
16968 for (SDUse &Use : Node->uses()) {
16969
16970 // Don't look at users of the chain.
16971 if (Use.getResNo() != 0)
16972 continue;
16973
16974 SDNode *User = Use.getUser();
16975
16976 // Abort if we can't understand the usage
16977 if (!User->isMachineOpcode() ||
16978 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
16979 return Node;
16980
16981 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
16982 // Note that subregs are packed, i.e. Lane==0 is the first bit set
16983 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
16984 // set, etc.
16985 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
16986 if (Lane == ~0u)
16987 return Node;
16988
16989 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
16990 if (UsesTFC && Lane == TFCLane) {
16991 Users[Lane] = User;
16992 } else {
16993 // Set which texture component corresponds to the lane.
16994 unsigned Comp;
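// Walk the set bits of OldDmask: after this loop Comp is the index of the
// Lane-th enabled channel, i.e. the texture component this subreg use reads.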
16995 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
16996 Comp = llvm::countr_zero(Dmask);
16997 Dmask &= ~(1 << Comp);
16998 }
16999
17000 // Abort if we have more than one user per component.
17001 if (Users[Lane])
17002 return Node;
17003
17004 Users[Lane] = User;
17005 NewDmask |= 1 << Comp;
17006 }
17007 }
17008
17009 // Don't allow 0 dmask, as hardware assumes one channel enabled.
17010 bool NoChannels = !NewDmask;
17011 if (NoChannels) {
17012 if (!UsesTFC) {
17013 // No uses of the result and not using TFC. Then do nothing.
17014 return Node;
17015 }
17016 // If the original dmask has one channel, then there is nothing to do.
17017 if (OldBitsSet == 1)
17018 return Node;
17019 // Use an arbitrary dmask - required for the instruction to work
17020 NewDmask = 1;
17021 }
17022 // Abort if there's no change
17023 if (NewDmask == OldDmask)
17024 return Node;
17025
17026 unsigned BitsSet = llvm::popcount(NewDmask);
17027
17028 // Check for TFE or LWE - increase the number of channels by one to account
17029 // for the extra return value
17030 // This will need adjustment for D16 if this is also included in
17031 // adjustWriteMask (this function) but at present D16 are excluded.
17032 unsigned NewChannels = BitsSet + UsesTFC;
17033
17034 int NewOpcode =
17035 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
17036 assert(NewOpcode != -1 &&
17037 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17038 "failed to find equivalent MIMG op");
17039
17040 // Adjust the writemask in the node
17041 SmallVector<SDValue, 12> Ops;
17042 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
17043 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
17044 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
17045
17046 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17047
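// A single remaining channel uses the scalar type; 3- and 5-channel results are
// widened to 4- and 8-element vectors respectively.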
17048 MVT ResultVT = NewChannels == 1
17049 ? SVT
17050 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
17051 : NewChannels == 5 ? 8
17052 : NewChannels);
17053 SDVTList NewVTList =
17054 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
17055
17056 MachineSDNode *NewNode =
17057 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
17058
17059 if (HasChain) {
17060 // Update chain.
17061 DAG.setNodeMemRefs(NewNode, Node->memoperands());
17062 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
17063 }
17064
17065 if (NewChannels == 1) {
17066 assert(Node->hasNUsesOfValue(1, 0));
17067 SDNode *Copy =
17068 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
17069 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
17070 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
17071 return nullptr;
17072 }
17073
17074 // Update the users of the node with the new indices
17075 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17076 SDNode *User = Users[i];
17077 if (!User) {
17078 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
17079 // Users[0] is still nullptr because channel 0 doesn't really have a use.
17080 if (i || !NoChannels)
17081 continue;
17082 } else {
17083 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
17084 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
17085 if (NewUser != User) {
17086 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
17087 DAG.RemoveDeadNode(User);
17088 }
17089 }
17090
17091 switch (Idx) {
17092 default:
17093 break;
17094 case AMDGPU::sub0:
17095 Idx = AMDGPU::sub1;
17096 break;
17097 case AMDGPU::sub1:
17098 Idx = AMDGPU::sub2;
17099 break;
17100 case AMDGPU::sub2:
17101 Idx = AMDGPU::sub3;
17102 break;
17103 case AMDGPU::sub3:
17104 Idx = AMDGPU::sub4;
17105 break;
17106 }
17107 }
17108
17109 DAG.RemoveDeadNode(Node);
17110 return nullptr;
17111}
17112
17113 static bool isFrameIndexOp(SDValue Op) {
17114 if (Op.getOpcode() == ISD::AssertZext)
17115 Op = Op.getOperand(0);
17116
17117 return isa<FrameIndexSDNode>(Op);
17118}
17119
17120/// Legalize target independent instructions (e.g. INSERT_SUBREG)
17121/// with frame index operands.
17122 /// LLVM assumes that inputs to these instructions are registers.
17123SDNode *
17124 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
17125 SelectionDAG &DAG) const {
17126 if (Node->getOpcode() == ISD::CopyToReg) {
17127 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
17128 SDValue SrcVal = Node->getOperand(2);
17129
17130 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
17131 // to try understanding copies to physical registers.
17132 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
17133 SDLoc SL(Node);
17134 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17135 SDValue VReg = DAG.getRegister(
17136 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17137
17138 SDNode *Glued = Node->getGluedNode();
17139 SDValue ToVReg = DAG.getCopyToReg(
17140 Node->getOperand(0), SL, VReg, SrcVal,
17141 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
17142 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
17143 VReg, ToVReg.getValue(1));
17144 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
17145 DAG.RemoveDeadNode(Node);
17146 return ToResultReg.getNode();
17147 }
17148 }
17149
17150 SmallVector<SDValue, 8> Ops;
17151 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17152 if (!isFrameIndexOp(Node->getOperand(i))) {
17153 Ops.push_back(Node->getOperand(i));
17154 continue;
17155 }
17156
17157 SDLoc DL(Node);
17158 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
17159 Node->getOperand(i).getValueType(),
17160 Node->getOperand(i)),
17161 0));
17162 }
17163
17164 return DAG.UpdateNodeOperands(Node, Ops);
17165}
17166
17167/// Fold the instructions after selecting them.
17168/// Returns null if users were already updated.
17169 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
17170 SelectionDAG &DAG) const {
17171 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17172 unsigned Opcode = Node->getMachineOpcode();
17173
17174 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17175 !TII->isGather4(Opcode) &&
17176 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
17177 return adjustWritemask(Node, DAG);
17178 }
17179
17180 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17181 legalizeTargetIndependentNode(Node, DAG);
17182 return Node;
17183 }
17184
17185 switch (Opcode) {
17186 case AMDGPU::V_DIV_SCALE_F32_e64:
17187 case AMDGPU::V_DIV_SCALE_F64_e64: {
17188 // Satisfy the operand register constraint when one of the inputs is
17189 // undefined. Ordinarily each undef value will have its own implicit_def of
17190 // a vreg, so force these to use a single register.
17191 SDValue Src0 = Node->getOperand(1);
17192 SDValue Src1 = Node->getOperand(3);
17193 SDValue Src2 = Node->getOperand(5);
17194
17195 if ((Src0.isMachineOpcode() &&
17196 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
17197 (Src0 == Src1 || Src0 == Src2))
17198 break;
17199
17200 MVT VT = Src0.getValueType().getSimpleVT();
17201 const TargetRegisterClass *RC =
17202 getRegClassFor(VT, Src0.getNode()->isDivergent());
17203
17204 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
17205 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
17206
17207 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
17208 Src0, SDValue());
17209
17210 // src0 must be the same register as src1 or src2, even if the value is
17211 // undefined, so make sure we don't violate this constraint.
17212 if (Src0.isMachineOpcode() &&
17213 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
17214 if (Src1.isMachineOpcode() &&
17215 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17216 Src0 = Src1;
17217 else if (Src2.isMachineOpcode() &&
17218 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
17219 Src0 = Src2;
17220 else {
17221 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
17222 Src0 = UndefReg;
17223 Src1 = UndefReg;
17224 }
17225 } else
17226 break;
17227
17228 SmallVector<SDValue, 9> Ops(Node->ops());
17229 Ops[1] = Src0;
17230 Ops[3] = Src1;
17231 Ops[5] = Src2;
17232 Ops.push_back(ImpDef.getValue(1));
17233 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
17234 }
17235 default:
17236 break;
17237 }
17238
17239 return Node;
17240}
17241
17242// Any MIMG instructions that use tfe or lwe require an initialization of the
17243// result register that will be written in the case of a memory access failure.
17244// The required code is also added to tie this init code to the result of the
17245// img instruction.
17246 void SITargetLowering::AddMemOpInit(MachineInstr &MI) const {
17247 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17248 const SIRegisterInfo &TRI = TII->getRegisterInfo();
17249 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
17250 MachineBasicBlock &MBB = *MI.getParent();
17251
17252 int DstIdx =
17253 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17254 unsigned InitIdx = 0;
17255
17256 if (TII->isImage(MI)) {
17257 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
17258 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
17259 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
17260
17261 if (!TFE && !LWE) // intersect_ray
17262 return;
17263
17264 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17265 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17266 unsigned D16Val = D16 ? D16->getImm() : 0;
17267
17268 if (!TFEVal && !LWEVal)
17269 return;
17270
17271 // At least one of TFE or LWE are non-zero
17272 // We have to insert a suitable initialization of the result value and
17273 // tie this to the dest of the image instruction.
17274
17275 // Calculate which dword we have to initialize to 0.
17276 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
17277
17278 // Check that the dmask operand was found.
17279 assert(MO_Dmask && "Expected dmask operand in instruction");
17280
17281 unsigned dmask = MO_Dmask->getImm();
17282 // Determine the number of active lanes taking into account the
17283 // Gather4 special case
17284 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
17285
17286 bool Packed = !Subtarget->hasUnpackedD16VMem();
17287
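// With packed D16, two half-sized channels share one dword, so halve the active
// lane count (rounding up); the extra +1 is the TFE/LWE status dword.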
17288 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17289
17290 // Abandon the attempt if the dst size isn't large enough
17291 // - this is in fact an error, but it is picked up elsewhere and
17292 // reported correctly.
17293 uint32_t DstSize =
17294 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17295 if (DstSize < InitIdx)
17296 return;
17297 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
17298 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17299 } else {
17300 return;
17301 }
17302
17303 const DebugLoc &DL = MI.getDebugLoc();
17304
17305 // Create a register for the initialization value.
17306 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17307 unsigned NewDst = 0; // Final initialized value will be in here
17308
17309 // If PRTStrictNull feature is enabled (the default) then initialize
17310 // all the result registers to 0, otherwise just the error indication
17311 // register (VGPRn+1)
17312 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17313 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17314
17315 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
17316 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17317 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17318 // Initialize dword
17319 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
17320 // clang-format off
17321 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
17322 .addImm(0);
17323 // clang-format on
17324 // Insert into the super-reg
17325 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
17326 .addReg(PrevDst)
17327 .addReg(SubReg)
17328 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
17329
17330 PrevDst = NewDst;
17331 }
17332
17333 // Add as an implicit operand
17334 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
17335
17336 // Tie the just added implicit operand to the dst
17337 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17338}
17339
17340/// Assign the register class depending on the number of
17341/// bits set in the writemask
17342 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
17343 SDNode *Node) const {
17344 const SIInstrInfo *TII = Subtarget->getInstrInfo();
17345
17346 MachineFunction *MF = MI.getParent()->getParent();
17347 MachineRegisterInfo &MRI = MF->getRegInfo();
17348 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
17349
17350 if (TII->isVOP3(MI.getOpcode())) {
17351 // Make sure constant bus requirements are respected.
17352 TII->legalizeOperandsVOP3(MRI, MI);
17353
17354 // Prefer VGPRs over AGPRs in mAI instructions where possible.
17355 // This saves a chain-copy of registers and better balance register
17356 // use between vgpr and agpr as agpr tuples tend to be big.
17357 if (!MI.getDesc().operands().empty()) {
17358 unsigned Opc = MI.getOpcode();
17359 bool HasAGPRs = Info->mayNeedAGPRs();
17360 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17361 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
17362 for (auto I :
17363 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
17364 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
17365 if (I == -1)
17366 break;
17367 if ((I == Src2Idx) && (HasAGPRs))
17368 break;
17369 MachineOperand &Op = MI.getOperand(I);
17370 if (!Op.isReg() || !Op.getReg().isVirtual())
17371 continue;
17372 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
17373 if (!TRI->hasAGPRs(RC))
17374 continue;
17375 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
17376 if (!Src || !Src->isCopy() ||
17377 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
17378 continue;
17379 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
17380 // All uses of agpr64 and agpr32 can also accept vgpr except for
17381 // v_accvgpr_read, but we do not produce agpr reads during selection,
17382 // so no use checks are needed.
17383 MRI.setRegClass(Op.getReg(), NewRC);
17384 }
17385
17386 if (TII->isMAI(MI)) {
17387 // The ordinary src0, src1, src2 were legalized above.
17388 //
17389 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
17390 // as a separate instruction.
17391 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17392 AMDGPU::OpName::scale_src0);
17393 if (Src0Idx != -1) {
17394 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17395 AMDGPU::OpName::scale_src1);
17396 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17397 TII->usesConstantBus(MRI, MI, Src1Idx))
17398 TII->legalizeOpWithMove(MI, Src1Idx);
17399 }
17400 }
17401
17402 if (!HasAGPRs)
17403 return;
17404
17405 // Resolve the rest of AV operands to AGPRs.
17406 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
17407 if (Src2->isReg() && Src2->getReg().isVirtual()) {
17408 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
17409 if (TRI->isVectorSuperClass(RC)) {
17410 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
17411 MRI.setRegClass(Src2->getReg(), NewRC);
17412 if (Src2->isTied())
17413 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
17414 }
17415 }
17416 }
17417 }
17418
17419 return;
17420 }
17421
17422 if (TII->isImage(MI))
17423 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17424}
17425
17426 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
17427 uint64_t Val) {
17428 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
17429 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
17430}
17431
17432 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
17433 const SDLoc &DL,
17434 SDValue Ptr) const {
17435 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17436
17437 // Build the half of the subregister with the constants before building the
17438 // full 128-bit register. If we are building multiple resource descriptors,
17439 // this will allow CSEing of the 2-component register.
17440 const SDValue Ops0[] = {
17441 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
17442 buildSMovImm32(DAG, DL, 0),
17443 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17444 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
17445 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
17446
17447 SDValue SubRegHi = SDValue(
17448 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
17449
17450 // Combine the constants and the pointer.
17451 const SDValue Ops1[] = {
17452 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
17453 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
17454 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
17455
17456 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
17457}
17458
17459/// Return a resource descriptor with the 'Add TID' bit enabled
17460/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
17461/// of the resource descriptor) to create an offset, which is added to
17462/// the resource pointer.
17463 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
17464 SDValue Ptr, uint32_t RsrcDword1,
17465 uint64_t RsrcDword2And3) const {
17466 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
17467 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
17468 if (RsrcDword1) {
17469 PtrHi =
17470 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
17471 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
17472 0);
17473 }
17474
17475 SDValue DataLo =
17476 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
17477 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
17478
17479 const SDValue Ops[] = {
17480 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
17481 PtrLo,
17482 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
17483 PtrHi,
17484 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
17485 DataLo,
17486 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
17487 DataHi,
17488 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
17489
17490 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
17491}
17492
17493//===----------------------------------------------------------------------===//
17494// SI Inline Assembly Support
17495//===----------------------------------------------------------------------===//
17496
17497std::pair<unsigned, const TargetRegisterClass *>
17498 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
17499 StringRef Constraint,
17500 MVT VT) const {
17501 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
17502
17503 const TargetRegisterClass *RC = nullptr;
17504 if (Constraint.size() == 1) {
17505 // Check if we cannot determine the bit size of the given value type. This
17506 // can happen, for example, in this situation where we have an empty struct
17507 // (size 0): `call void asm "", "v"({} poison)`.
17508 if (VT == MVT::Other)
17509 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17510 const unsigned BitWidth = VT.getSizeInBits();
17511 switch (Constraint[0]) {
17512 default:
17513 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17514 case 's':
17515 case 'r':
17516 switch (BitWidth) {
17517 case 16:
17518 RC = &AMDGPU::SReg_32RegClass;
17519 break;
17520 case 64:
17521 RC = &AMDGPU::SGPR_64RegClass;
17522 break;
17523 default:
17524 RC = TRI->getSGPRClassForBitWidth(BitWidth);
17525 if (!RC)
17526 return std::pair(0U, nullptr);
17527 break;
17528 }
17529 break;
17530 case 'v':
17531 switch (BitWidth) {
17532 case 16:
17533 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17534 : &AMDGPU::VGPR_32_Lo256RegClass;
17535 break;
17536 default:
17537 RC = Subtarget->has1024AddressableVGPRs()
17538 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17539 : TRI->getVGPRClassForBitWidth(BitWidth);
17540 if (!RC)
17541 return std::pair(0U, nullptr);
17542 break;
17543 }
17544 break;
17545 case 'a':
17546 if (!Subtarget->hasMAIInsts())
17547 break;
17548 switch (BitWidth) {
17549 case 16:
17550 RC = &AMDGPU::AGPR_32RegClass;
17551 break;
17552 default:
17553 RC = TRI->getAGPRClassForBitWidth(BitWidth);
17554 if (!RC)
17555 return std::pair(0U, nullptr);
17556 break;
17557 }
17558 break;
17559 }
17560 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17561 const unsigned BitWidth = VT.getSizeInBits();
17562 switch (BitWidth) {
17563 case 16:
17564 RC = &AMDGPU::AV_32RegClass;
17565 break;
17566 default:
17567 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17568 if (!RC)
17569 return std::pair(0U, nullptr);
17570 break;
17571 }
17572 }
17573
17574 // We actually support i128, i16 and f16 as inline parameters
17575 // even if they are not reported as legal
17576 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
17577 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
17578 return std::pair(0U, RC);
17579
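// Handle explicit physical-register constraints (e.g. "{v1}" or a register
// range): parseAsmConstraintPhysReg returns the register-file letter, the
// starting index, and the number of registers requested.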
17580 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
17581 if (Kind != '\0') {
17582 if (Kind == 'v') {
17583 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17584 } else if (Kind == 's') {
17585 RC = &AMDGPU::SGPR_32RegClass;
17586 } else if (Kind == 'a') {
17587 RC = &AMDGPU::AGPR_32RegClass;
17588 }
17589
17590 if (RC) {
17591 if (NumRegs > 1) {
17592 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
17593 return std::pair(0U, nullptr);
17594
17595 uint32_t Width = NumRegs * 32;
17596 // Prohibit constraints for register ranges with a width that does not
17597 // match the required type.
17598 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
17599 return std::pair(0U, nullptr);
17600
17601 MCRegister Reg = RC->getRegister(Idx);
17602 if (SIRegisterInfo::isVGPRClass(RC))
17603 RC = TRI->getVGPRClassForBitWidth(Width);
17604 else if (SIRegisterInfo::isSGPRClass(RC))
17605 RC = TRI->getSGPRClassForBitWidth(Width);
17606 else if (SIRegisterInfo::isAGPRClass(RC))
17607 RC = TRI->getAGPRClassForBitWidth(Width);
17608 if (RC) {
17609 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17610 if (!Reg) {
17611 // The register class does not contain the requested register,
17612 // e.g., because it is an SGPR pair that would violate alignment
17613 // requirements.
17614 return std::pair(0U, nullptr);
17615 }
17616 return std::pair(Reg, RC);
17617 }
17618 }
17619
17620 // Check for lossy scalar/vector conversions.
17621 if (VT.isVector() && VT.getSizeInBits() != 32)
17622 return std::pair(0U, nullptr);
17623 if (Idx < RC->getNumRegs())
17624 return std::pair(RC->getRegister(Idx), RC);
17625 return std::pair(0U, nullptr);
17626 }
17627 }
17628
17629 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
17630 if (Ret.first)
17631 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17632
17633 return Ret;
17634}
17635
17636static bool isImmConstraint(StringRef Constraint) {
17637 if (Constraint.size() == 1) {
17638 switch (Constraint[0]) {
17639 default:
17640 break;
17641 case 'I':
17642 case 'J':
17643 case 'A':
17644 case 'B':
17645 case 'C':
17646 return true;
17647 }
17648 } else if (Constraint == "DA" || Constraint == "DB") {
17649 return true;
17650 }
17651 return false;
17652}
17653
17654 SITargetLowering::ConstraintType
17655 SITargetLowering::getConstraintType(StringRef Constraint) const {
17656 if (Constraint.size() == 1) {
17657 switch (Constraint[0]) {
17658 default:
17659 break;
17660 case 's':
17661 case 'v':
17662 case 'a':
17663 return C_RegisterClass;
17664 }
17665 } else if (Constraint.size() == 2) {
17666 if (Constraint == "VA")
17667 return C_RegisterClass;
17668 }
17669 if (isImmConstraint(Constraint)) {
17670 return C_Other;
17671 }
17672 return TargetLowering::getConstraintType(Constraint);
17673}
17674
17675static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
17677 Val = Val & maskTrailingOnes<uint64_t>(Size);
17678 }
17679 return Val;
17680}
17681
17682 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
17683 StringRef Constraint,
17684 std::vector<SDValue> &Ops,
17685 SelectionDAG &DAG) const {
17686 if (isImmConstraint(Constraint)) {
17687 uint64_t Val;
17688 if (getAsmOperandConstVal(Op, Val) &&
17689 checkAsmConstraintVal(Op, Constraint, Val)) {
17690 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
17691 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
17692 }
17693 } else {
17694 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
17695 }
17696}
17697
17698 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
17699 unsigned Size = Op.getScalarValueSizeInBits();
17700 if (Size > 64)
17701 return false;
17702
17703 if (Size == 16 && !Subtarget->has16BitInsts())
17704 return false;
17705
17706 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
17707 Val = C->getSExtValue();
17708 return true;
17709 }
17710 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
17711 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17712 return true;
17713 }
17714 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
17715 if (Size != 16 || Op.getNumOperands() != 2)
17716 return false;
17717 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17718 return false;
17719 if (ConstantSDNode *C = V->getConstantSplatNode()) {
17720 Val = C->getSExtValue();
17721 return true;
17722 }
17723 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
17724 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17725 return true;
17726 }
17727 }
17728
17729 return false;
17730}
17731
17731 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
17732 uint64_t Val) const {
17734 if (Constraint.size() == 1) {
17735 switch (Constraint[0]) {
17736 case 'I':
17737 return AMDGPU::isInlinableIntLiteral(Val);
17738 case 'J':
17739 return isInt<16>(Val);
17740 case 'A':
17741 return checkAsmConstraintValA(Op, Val);
17742 case 'B':
17743 return isInt<32>(Val);
17744 case 'C':
17745 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
17747 default:
17748 break;
17749 }
17750 } else if (Constraint.size() == 2) {
17751 if (Constraint == "DA") {
17752 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17753 int64_t LoBits = static_cast<int32_t>(Val);
17754 return checkAsmConstraintValA(Op, HiBits, 32) &&
17755 checkAsmConstraintValA(Op, LoBits, 32);
17756 }
17757 if (Constraint == "DB") {
17758 return true;
17759 }
17760 }
17761 llvm_unreachable("Invalid asm constraint");
17762}
17763
17765 unsigned MaxSize) const {
17766 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17767 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17768 if (Size == 16) {
17769 MVT VT = Op.getSimpleValueType();
17770 switch (VT.SimpleTy) {
17771 default:
17772 return false;
17773 case MVT::i16:
17774 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
17775 case MVT::f16:
17776 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
17777 case MVT::bf16:
17778 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
17779 case MVT::v2i16:
17780 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
17781 case MVT::v2f16:
17782 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
17783 case MVT::v2bf16:
17784 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
17785 }
17786 }
17787 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
17788 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
17789 return true;
17790 return false;
17791}
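// Illustrative example (editorial addition): for an f16 operand on a
// subtarget with the inv2pi inline constant, the value 0x3C00 (half 1.0) is
// accepted as an inline literal by the check above, while 0x3C01 is not and
// would be rejected for the 'A' constraint.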
17792
17793static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
17794 switch (UnalignedClassID) {
17795 case AMDGPU::VReg_64RegClassID:
17796 return AMDGPU::VReg_64_Align2RegClassID;
17797 case AMDGPU::VReg_96RegClassID:
17798 return AMDGPU::VReg_96_Align2RegClassID;
17799 case AMDGPU::VReg_128RegClassID:
17800 return AMDGPU::VReg_128_Align2RegClassID;
17801 case AMDGPU::VReg_160RegClassID:
17802 return AMDGPU::VReg_160_Align2RegClassID;
17803 case AMDGPU::VReg_192RegClassID:
17804 return AMDGPU::VReg_192_Align2RegClassID;
17805 case AMDGPU::VReg_224RegClassID:
17806 return AMDGPU::VReg_224_Align2RegClassID;
17807 case AMDGPU::VReg_256RegClassID:
17808 return AMDGPU::VReg_256_Align2RegClassID;
17809 case AMDGPU::VReg_288RegClassID:
17810 return AMDGPU::VReg_288_Align2RegClassID;
17811 case AMDGPU::VReg_320RegClassID:
17812 return AMDGPU::VReg_320_Align2RegClassID;
17813 case AMDGPU::VReg_352RegClassID:
17814 return AMDGPU::VReg_352_Align2RegClassID;
17815 case AMDGPU::VReg_384RegClassID:
17816 return AMDGPU::VReg_384_Align2RegClassID;
17817 case AMDGPU::VReg_512RegClassID:
17818 return AMDGPU::VReg_512_Align2RegClassID;
17819 case AMDGPU::VReg_1024RegClassID:
17820 return AMDGPU::VReg_1024_Align2RegClassID;
17821 case AMDGPU::AReg_64RegClassID:
17822 return AMDGPU::AReg_64_Align2RegClassID;
17823 case AMDGPU::AReg_96RegClassID:
17824 return AMDGPU::AReg_96_Align2RegClassID;
17825 case AMDGPU::AReg_128RegClassID:
17826 return AMDGPU::AReg_128_Align2RegClassID;
17827 case AMDGPU::AReg_160RegClassID:
17828 return AMDGPU::AReg_160_Align2RegClassID;
17829 case AMDGPU::AReg_192RegClassID:
17830 return AMDGPU::AReg_192_Align2RegClassID;
17831 case AMDGPU::AReg_256RegClassID:
17832 return AMDGPU::AReg_256_Align2RegClassID;
17833 case AMDGPU::AReg_512RegClassID:
17834 return AMDGPU::AReg_512_Align2RegClassID;
17835 case AMDGPU::AReg_1024RegClassID:
17836 return AMDGPU::AReg_1024_Align2RegClassID;
17837 default:
17838 return -1;
17839 }
17840}
17841
17842// Figure out which registers should be reserved for stack access. Only after
17843// the function is legalized do we know all of the non-spill stack objects or if
17844// calls are present.
17848 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17849 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
17850 const SIInstrInfo *TII = ST.getInstrInfo();
17851
17852 if (Info->isEntryFunction()) {
17853 // Callable functions have fixed registers used for stack access.
17854 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
17855 }
17856
17857 // TODO: Move this logic to getReservedRegs()
17858 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
17859 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17860 Register SReg = ST.isWave32()
17861 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17862 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
17863 &AMDGPU::SGPR_64RegClass);
17864 Info->setSGPRForEXECCopy(SReg);
17865
17866 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17867 Info->getStackPtrOffsetReg()));
17868 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17869 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17870
17871 // We need to worry about replacing the default register with itself in case
17872 // of MIR testcases missing the MFI.
17873 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17874 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17875
17876 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17877 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17878
17879 Info->limitOccupancy(MF);
17880
17881 if (ST.isWave32() && !MF.empty()) {
17882 for (auto &MBB : MF) {
17883 for (auto &MI : MBB) {
17884 TII->fixImplicitOperands(MI);
17885 }
17886 }
17887 }
17888
17889 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
17890 // classes if required. Ideally the register class constraints would differ
17891 // per-subtarget, but there's no easy way to achieve that right now. This is
17892 // not a problem for VGPRs because the correctly aligned VGPR class is implied
17893 // from using them as the register class for legal types.
17894 if (ST.needsAlignedVGPRs()) {
17895 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17896 const Register Reg = Register::index2VirtReg(I);
17897 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
17898 if (!RC)
17899 continue;
17900 int NewClassID = getAlignedAGPRClassID(RC->getID());
17901 if (NewClassID != -1)
17902 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
17903 }
17904 }
17905
17907}
17908
17910 KnownBits &Known,
17911 const APInt &DemandedElts,
17912 const SelectionDAG &DAG,
17913 unsigned Depth) const {
17914 Known.resetAll();
17915 unsigned Opc = Op.getOpcode();
17916 switch (Opc) {
17918 unsigned IID = Op.getConstantOperandVal(0);
17919 switch (IID) {
17920 case Intrinsic::amdgcn_mbcnt_lo:
17921 case Intrinsic::amdgcn_mbcnt_hi: {
17922 const GCNSubtarget &ST =
17924 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
17925 // most 31 + src1.
17926 Known.Zero.setBitsFrom(
17927 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17928 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
17929 Known = KnownBits::add(Known, Known2);
17930 return;
17931 }
17932 }
17933 break;
17934 }
17935 }
17937 Op, Known, DemandedElts, DAG, Depth);
17938}
17939
17941 const int FI, KnownBits &Known, const MachineFunction &MF) const {
17943
17944 // Set the high bits to zero based on the maximum allowed scratch size per
17945 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
17946 // calculation won't overflow, so assume the sign bit is never set.
17947 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
17948}
17949
17951 GISelValueTracking &VT, KnownBits &Known,
17952 unsigned Dim) {
17953 unsigned MaxValue =
17954 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
17955 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
17956}
17957
17959 KnownBits &Known, const APInt &DemandedElts,
17960 unsigned BFEWidth, bool SExt, unsigned Depth) {
17962 const MachineOperand &Src1 = MI.getOperand(2);
17963
17964 unsigned Src1Cst = 0;
17965 if (Src1.isImm()) {
17966 Src1Cst = Src1.getImm();
17967 } else if (Src1.isReg()) {
17968 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
17969 if (!Cst)
17970 return;
17971 Src1Cst = Cst->Value.getZExtValue();
17972 } else {
17973 return;
17974 }
17975
17976 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
17977 // Width is always [22:16].
17978 const unsigned Offset =
17979 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
17980 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
17981
17982 if (Width >= BFEWidth) // Ill-formed.
17983 return;
17984
17985 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
17986 Depth + 1);
17987
17988 Known = Known.extractBits(Width, Offset);
17989
17990 if (SExt)
17991 Known = Known.sext(BFEWidth);
17992 else
17993 Known = Known.zext(BFEWidth);
17994}
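// Illustrative example (editorial addition): for S_BFE_U32 with
// Src1Cst = 0x00080004, the decoding above gives Offset = 4 and Width = 8,
// so the known bits of src0[11:4] are extracted and zero-extended back to
// 32 bits (S_BFE_I32 would sign-extend instead).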
17995
17997 GISelValueTracking &VT, Register R, KnownBits &Known,
17998 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
17999 unsigned Depth) const {
18000 Known.resetAll();
18001 const MachineInstr *MI = MRI.getVRegDef(R);
18002 switch (MI->getOpcode()) {
18003 case AMDGPU::S_BFE_I32:
18004 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18005 /*SExt=*/true, Depth);
18006 case AMDGPU::S_BFE_U32:
18007 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
18008 /*SExt=*/false, Depth);
18009 case AMDGPU::S_BFE_I64:
18010 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18011 /*SExt=*/true, Depth);
18012 case AMDGPU::S_BFE_U64:
18013 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
18014 /*SExt=*/false, Depth);
18015 case AMDGPU::G_INTRINSIC:
18016 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18017 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
18018 switch (IID) {
18019 case Intrinsic::amdgcn_workitem_id_x:
18020 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
18021 break;
18022 case Intrinsic::amdgcn_workitem_id_y:
18023 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
18024 break;
18025 case Intrinsic::amdgcn_workitem_id_z:
18026 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
18027 break;
18028 case Intrinsic::amdgcn_mbcnt_lo:
18029 case Intrinsic::amdgcn_mbcnt_hi: {
18030 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
18031 // most 31 + src1.
18032 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
18033 ? getSubtarget()->getWavefrontSizeLog2()
18034 : 5);
18035 KnownBits Known2;
18036 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
18037 Depth + 1);
18038 Known = KnownBits::add(Known, Known2);
18039 break;
18040 }
18041 case Intrinsic::amdgcn_groupstaticsize: {
18042 // We can report everything over the maximum size as 0. We can't report
18043 // based on the actual size because we don't know if it's accurate or not
18044 // at any given point.
18045 Known.Zero.setHighBits(
18046 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
18047 break;
18048 }
18049 }
18050 break;
18051 }
18052 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18053 Known.Zero.setHighBits(24);
18054 break;
18055 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18056 Known.Zero.setHighBits(16);
18057 break;
18058 case AMDGPU::G_AMDGPU_SMED3:
18059 case AMDGPU::G_AMDGPU_UMED3: {
18060 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18061
18062 KnownBits Known2;
18063 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
18064 if (Known2.isUnknown())
18065 break;
18066
18067 KnownBits Known1;
18068 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
18069 if (Known1.isUnknown())
18070 break;
18071
18072 KnownBits Known0;
18073 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
18074 if (Known0.isUnknown())
18075 break;
18076
18077 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
18078 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
18079 Known.One = Known0.One & Known1.One & Known2.One;
18080 break;
18081 }
18082 }
18083}
18084
18087 unsigned Depth) const {
18088 const MachineInstr *MI = MRI.getVRegDef(R);
18089 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
18090 // FIXME: Can this move to generic code? What about the case where the call
18091 // site specifies a lower alignment?
18092 Intrinsic::ID IID = GI->getIntrinsicID();
18094 AttributeList Attrs =
18095 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
18096 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18097 return *RetAlign;
18098 }
18099 return Align(1);
18100}
18101
18104 const Align CacheLineAlign = Align(64);
18105
18106 // Pre-GFX10 targets did not benefit from loop alignment
18107 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
18108 getSubtarget()->hasInstFwdPrefetchBug())
18109 return PrefAlign;
18110
18111 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
18112 // By default the prefetcher keeps one cache line behind and reads two ahead.
18113 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
18114 // behind and one ahead.
18115 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
18116 // If the loop fits in 64 bytes it always spans no more than two cache lines and
18117 // does not need alignment.
18118 // Otherwise, if the loop is at most 128 bytes we do not need to modify the prefetch;
18119 // if it is at most 192 bytes we need two lines behind.
18120
18122 const MachineBasicBlock *Header = ML->getHeader();
18123 if (Header->getAlignment() != PrefAlign)
18124 return Header->getAlignment(); // Already processed.
18125
18126 unsigned LoopSize = 0;
18127 for (const MachineBasicBlock *MBB : ML->blocks()) {
18128 // If an inner loop block is aligned, assume on average half of the alignment
18129 // size is added as nops.
18130 if (MBB != Header)
18131 LoopSize += MBB->getAlignment().value() / 2;
18132
18133 for (const MachineInstr &MI : *MBB) {
18134 LoopSize += TII->getInstSizeInBytes(MI);
18135 if (LoopSize > 192)
18136 return PrefAlign;
18137 }
18138 }
18139
18140 if (LoopSize <= 64)
18141 return PrefAlign;
18142
18143 if (LoopSize <= 128)
18144 return CacheLineAlign;
18145
18146 // If any of the parent loops is surrounded by prefetch instructions, do not
18147 // insert new ones for the inner loop; that would reset the parent's settings.
18148 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
18149 if (MachineBasicBlock *Exit = P->getExitBlock()) {
18150 auto I = Exit->getFirstNonDebugInstr();
18151 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18152 return CacheLineAlign;
18153 }
18154 }
18155
18156 MachineBasicBlock *Pre = ML->getLoopPreheader();
18157 MachineBasicBlock *Exit = ML->getExitBlock();
18158
18159 if (Pre && Exit) {
18160 auto PreTerm = Pre->getFirstTerminator();
18161 if (PreTerm == Pre->begin() ||
18162 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18163 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18164 .addImm(1); // prefetch 2 lines behind PC
18165
18166 auto ExitHead = Exit->getFirstNonDebugInstr();
18167 if (ExitHead == Exit->end() ||
18168 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18169 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
18170 .addImm(2); // prefetch 1 line behind PC
18171 }
18172
18173 return CacheLineAlign;
18174}
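// Illustrative example (editorial addition), assuming a GFX10-class subtarget
// with working instruction prefetch: a 96-byte inner loop gets its header
// aligned to the 64-byte cache line with no prefetch change, while a 160-byte
// loop additionally gets S_INST_PREFETCH 1 in the preheader (two lines behind)
// and S_INST_PREFETCH 2 after the exit (back to one line behind).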
18175
18177static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
18178 assert(N->getOpcode() == ISD::CopyFromReg);
18179 do {
18180 // Follow the chain until we find an INLINEASM node.
18181 N = N->getOperand(0).getNode();
18182 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18183 return true;
18184 } while (N->getOpcode() == ISD::CopyFromReg);
18185 return false;
18186}
18187
18190 UniformityInfo *UA) const {
18191 switch (N->getOpcode()) {
18192 case ISD::CopyFromReg: {
18193 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
18194 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
18195 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18196 Register Reg = R->getReg();
18197
18198 // FIXME: Why does this need to consider isLiveIn?
18199 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18200 return !TRI->isSGPRReg(MRI, Reg);
18201
18202 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
18203 return UA->isDivergent(V);
18204
18206 return !TRI->isSGPRReg(MRI, Reg);
18207 }
18208 case ISD::LOAD: {
18209 const LoadSDNode *L = cast<LoadSDNode>(N);
18210 unsigned AS = L->getAddressSpace();
18211 // A flat load may access private memory.
18213 }
18214 case ISD::CALLSEQ_END:
18215 return true;
18217 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
18219 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
18238 // Target-specific read-modify-write atomics are sources of divergence.
18239 return true;
18240 default:
18241 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
18242 // Generic read-modify-write atomics are sources of divergence.
18243 return A->readMem() && A->writeMem();
18244 }
18245 return false;
18246 }
18247}
18248
18250 EVT VT) const {
18251 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
18252 case MVT::f32:
18254 case MVT::f64:
18255 case MVT::f16:
18257 default:
18258 return false;
18259 }
18260}
18261
18263 LLT Ty, const MachineFunction &MF) const {
18264 switch (Ty.getScalarSizeInBits()) {
18265 case 32:
18266 return !denormalModeIsFlushAllF32(MF);
18267 case 64:
18268 case 16:
18269 return !denormalModeIsFlushAllF64F16(MF);
18270 default:
18271 return false;
18272 }
18273}
18274
18276 const APInt &DemandedElts,
18277 const SelectionDAG &DAG,
18278 bool SNaN,
18279 unsigned Depth) const {
18280 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18281 const MachineFunction &MF = DAG.getMachineFunction();
18283
18284 if (Info->getMode().DX10Clamp)
18285 return true; // Clamped to 0.
18286 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
18287 }
18288
18290 DAG, SNaN, Depth);
18291}
18292
18293// On older subtargets, global FP atomic instructions have a hardcoded FP mode
18294// that does not support FP32 denormals; only v2f16/f64 denormals are supported.
18296 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18297 return true;
18298
18300 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
18301 if (DenormMode == DenormalMode::getPreserveSign())
18302 return true;
18303
18304 // TODO: Remove this.
18305 return RMW->getFunction()
18306 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
18307 .getValueAsBool();
18308}
18309
18311 LLVMContext &Ctx = RMW->getContext();
18312 StringRef MemScope =
18313 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
18314
18315 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
18316 << "Hardware instruction generated for atomic "
18317 << RMW->getOperationName(RMW->getOperation())
18318 << " operation at memory scope " << MemScope;
18319}
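// Illustrative example (editorial addition): combined with the
// ReportUnsafeHWInst helper below, this produces remarks of the form
//   "Hardware instruction generated for atomic fadd operation at memory
//    scope agent due to an unsafe request."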
18320
18321static bool isV2F16OrV2BF16(Type *Ty) {
18322 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
18323 Type *EltTy = VT->getElementType();
18324 return VT->getNumElements() == 2 &&
18325 (EltTy->isHalfTy() || EltTy->isBFloatTy());
18326 }
18327
18328 return false;
18329}
18330
18331static bool isV2F16(Type *Ty) {
18333 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
18334}
18335
18336static bool isV2BF16(Type *Ty) {
18338 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
18339}
18340
18341/// \return true if atomicrmw integer ops work for the type.
18342static bool isAtomicRMWLegalIntTy(Type *Ty) {
18343 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
18344 unsigned BW = IT->getBitWidth();
18345 return BW == 32 || BW == 64;
18346 }
18347
18348 return false;
18349}
18350
18351/// \return true if this atomicrmw xchg type can be selected.
18352static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
18353 Type *Ty = RMW->getType();
18354 if (isAtomicRMWLegalIntTy(Ty))
18355 return true;
18356
18357 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
18358 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
18359 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18360 return BW == 32 || BW == 64;
18361 }
18362
18363 if (Ty->isFloatTy() || Ty->isDoubleTy())
18364 return true;
18365
18367 return VT->getNumElements() == 2 &&
18368 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18369 }
18370
18371 return false;
18372}
18373
18374/// \returns true if it's valid to emit a native instruction for \p RMW, based
18375/// on the properties of the target memory.
18376static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
18377 const AtomicRMWInst *RMW,
18378 bool HasSystemScope) {
18379 // The remote/fine-grained access logic is different from the integer
18380 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
18381 // fine-grained access does not work, even for a device local allocation.
18382 //
18383 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
18384 // allocations work.
18385 if (HasSystemScope) {
18387 RMW->hasMetadata("amdgpu.no.remote.memory"))
18388 return true;
18389 if (Subtarget.hasEmulatedSystemScopeAtomics())
18390 return true;
18392 return true;
18393
18394 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18395}
18396
18397/// \return Action to perform on AtomicRMWInsts for integer operations.
18404
18405/// Return if a flat address space atomicrmw can access private memory.
18407 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18408 return !MD ||
18410}
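// Illustrative sketch (editorial addition): a frontend can mark a flat atomic
// as not touching scratch with metadata along the lines of
//   %old = atomicrmw add ptr %p, i64 1 seq_cst, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6}  ; assumed to exclude AMDGPU private (addrspace 5)
// in which case the query above returns false and the 64-bit flat atomic does
// not need the runtime is.private expansion.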
18411
18419
18422 unsigned AS = RMW->getPointerAddressSpace();
18423 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
18425
18426 // 64-bit flat atomics that dynamically reside in private memory will silently
18427 // be dropped.
18428 //
18429 // Note that we will emit a new copy of the original atomic in the expansion,
18430 // which will be incrementally relegalized.
18431 const DataLayout &DL = RMW->getFunction()->getDataLayout();
18432 if (AS == AMDGPUAS::FLAT_ADDRESS &&
18433 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18436
18437 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
18439 ORE.emit([=]() {
18440 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
18441 });
18442 return Kind;
18443 };
18444
18445 auto SSID = RMW->getSyncScopeID();
18446 bool HasSystemScope =
18447 SSID == SyncScope::System ||
18448 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
18449
18450 auto Op = RMW->getOperation();
18451 switch (Op) {
18453 // PCIe supports add and xchg for system atomics.
18454 return isAtomicRMWLegalXChgTy(RMW)
18457 case AtomicRMWInst::Add:
18458 // PCIe supports add and xchg for system atomics.
18460 case AtomicRMWInst::Sub:
18461 case AtomicRMWInst::And:
18462 case AtomicRMWInst::Or:
18463 case AtomicRMWInst::Xor:
18464 case AtomicRMWInst::Max:
18465 case AtomicRMWInst::Min:
18472 if (Subtarget->hasEmulatedSystemScopeAtomics())
18474
18475 // On most subtargets, for atomicrmw operations other than add/xchg,
18476 // whether or not the instructions will behave correctly depends on where
18477 // the address physically resides and what interconnect is used in the
18478 // system configuration. On some targets the instruction will nop,
18479 // and in others synchronization will only occur at degraded device scope.
18480 //
18481 // If the allocation is known local to the device, the instructions should
18482 // work correctly.
18483 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
18485
18486 // If fine-grained remote memory works at device scope, we don't need to
18487 // do anything.
18488 if (!HasSystemScope &&
18489 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18491
18492 // If we are targeting a remote allocated address, it depends what kind of
18493 // allocation the address belongs to.
18494 //
18495 // If the allocation is fine-grained (in host memory, or in PCIe peer
18496 // device memory), the operation will fail depending on the target.
18497 //
18498 // Note fine-grained host memory access does work on APUs or if XGMI is
18499 // used, but we do not know if we are targeting an APU or the system
18500 // configuration from the ISA version/target-cpu.
18501 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18503
18506 // Atomic sub/or/xor do not work over PCI express, but atomic add
18507 // does. InstCombine transforms these with 0 to or, so undo that.
18508 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
18509 ConstVal && ConstVal->isNullValue())
18511 }
18512
18513 // If the allocation could be in remote, fine-grained memory, the rmw
18514 // instructions may fail. cmpxchg should work, so emit that. On some
18515 // system configurations, PCIe atomics aren't supported so cmpxchg won't
18516 // even work, so you're out of luck anyway.
18517
18518 // In summary:
18519 //
18520 // Cases that may fail:
18521 // - fine-grained pinned host memory
18522 // - fine-grained migratable host memory
18523 // - fine-grained PCIe peer device
18524 //
18525 // Cases that should work, but may be treated overly conservatively.
18526 // - fine-grained host memory on an APU
18527 // - fine-grained XGMI peer device
18529 }
18530
18532 }
18533 case AtomicRMWInst::FAdd: {
18534 Type *Ty = RMW->getType();
18535
18536 // TODO: Handle REGION_ADDRESS
18537 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18538 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
18539 // is fixed to round-to-nearest-even.
18540 //
18541 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
18542 // round-to-nearest-even.
18543 //
18544 // We ignore the rounding mode problem, even in strictfp. The C++ standard
18545 // suggests it is OK if the floating-point mode may not match the calling
18546 // thread.
18547 if (Ty->isFloatTy()) {
18548 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
18550 }
18551
18552 if (Ty->isDoubleTy()) {
18553 // Ignores denormal mode, but we don't consider flushing mandatory.
18554 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
18556 }
18557
18558 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18560
18562 }
18563
18564 // LDS atomics respect the denormal mode from the mode register.
18565 //
18566 // Traditionally f32 global/buffer memory atomics would unconditionally
18567 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
18568 // flush.
18569 //
18570 // On targets with flat atomic fadd, denormals would flush depending on
18571 // whether the target address resides in LDS or global memory. We consider
18572 // this flat-maybe-flush as will-flush.
18573 if (Ty->isFloatTy() &&
18574 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18577
18578 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
18579 // safe. The message phrasing also should be better.
18580 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18581 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18582 // gfx942, gfx12
18583 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
18584 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18585 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
18586 // gfx90a, gfx942, gfx12
18587 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18588 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18589
18590 // gfx942, gfx12
18591 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18592 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18593 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
18594 // gfx90a, gfx942, gfx12
18595 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18596 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18597
18598 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
18599 // buffer. gfx12 does have the buffer version.
18600 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18601 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18602 }
18603
18604 // global and flat atomic fadd f64: gfx90a, gfx942.
18605 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18606 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18607
18608 if (AS != AMDGPUAS::FLAT_ADDRESS) {
18609 if (Ty->isFloatTy()) {
18610 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
18611 // gfx11+.
18612 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18613 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18614 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
18615 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18616 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18617 } else {
18618 // gfx908
18619 if (RMW->use_empty() &&
18620 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18621 isV2F16(Ty))
18622 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18623 }
18624 }
18625
18626 // flat atomic fadd f32: gfx942, gfx11+.
18627 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
18628 if (Subtarget->hasFlatAtomicFaddF32Inst())
18629 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18630
18631 // If it is in the flat address space and the type is float, we will try to
18632 // expand it if the target supports both global and LDS atomic fadd. The
18633 // reason is that the expansion emits an address-space check: if the
18634 // address is in the global address space, we emit the global atomic
18635 // fadd; if it is in the shared address space, we emit the LDS atomic
18636 // fadd.
18637 if (Subtarget->hasLDSFPAtomicAddF32()) {
18638 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18640 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18642 }
18643 }
18644 }
18645
18647 }
18649 case AtomicRMWInst::FMax: {
18650 Type *Ty = RMW->getType();
18651
18652 // LDS float and double fmin/fmax were always supported.
18653 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
18654 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
18656 }
18657
18658 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
18659 // For flat and global cases:
18660 // float, double in gfx7. Manual claims denormal support.
18661 // Removed in gfx8.
18662 // float, double restored in gfx10.
18663 // double removed again in gfx11, so only f32 for gfx11/gfx12.
18664 //
18665 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
18666 // no f32.
18667 if (AS == AMDGPUAS::FLAT_ADDRESS) {
18668 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18669 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18670 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18671 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18672 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
18674 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18675 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18676 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18677 return ReportUnsafeHWInst(AtomicExpansionKind::None);
18678 }
18679 }
18680
18682 }
18685 default:
18687 }
18688
18689 llvm_unreachable("covered atomicrmw op switch");
18690}
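// Illustrative sketch (editorial addition): an atomicrmw fadd on a global
// pointer that the frontend knows does not touch fine-grained or remote
// allocations can be annotated roughly as
//   %r = atomicrmw fadd ptr addrspace(1) %p, float %v syncscope("agent")
//        monotonic, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
//   !0 = !{}
// which lets the logic above pick a native global atomic (where the subtarget
// has one) instead of a CAS loop expansion.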
18691
18698
18705
18708 unsigned AddrSpace = CmpX->getPointerAddressSpace();
18709 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
18711
18712 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
18714
18715 const DataLayout &DL = CmpX->getDataLayout();
18716
18717 Type *ValTy = CmpX->getNewValOperand()->getType();
18718
18719 // If a 64-bit flat atomic may alias private, we need to avoid using the
18720 // atomic in the private case.
18721 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
18723}
18724
18725const TargetRegisterClass *
18726SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
18728 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
18729 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18730 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18731 : &AMDGPU::SReg_32RegClass;
18732 if (!TRI->isSGPRClass(RC) && !isDivergent)
18733 return TRI->getEquivalentSGPRClass(RC);
18734 if (TRI->isSGPRClass(RC) && isDivergent)
18735 return TRI->getEquivalentVGPRClass(RC);
18736
18737 return RC;
18738}
18739
18740// FIXME: This is a workaround for DivergenceAnalysis not understanding always
18741// uniform values (as produced by the mask results of control flow intrinsics)
18742// used outside of divergent blocks. The phi users need to also be treated as
18743// always uniform.
18744//
18745// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
18746static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
18747 unsigned WaveSize) {
18748 // FIXME: We assume we never cast the mask results of a control flow
18749 // intrinsic.
18750 // Early exit if the type won't be consistent as a compile time hack.
18751 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
18752 if (!IT || IT->getBitWidth() != WaveSize)
18753 return false;
18754
18755 if (!isa<Instruction>(V))
18756 return false;
18757 if (!Visited.insert(V).second)
18758 return false;
18759 bool Result = false;
18760 for (const auto *U : V->users()) {
18762 if (V == U->getOperand(1)) {
18763 switch (Intrinsic->getIntrinsicID()) {
18764 default:
18765 Result = false;
18766 break;
18767 case Intrinsic::amdgcn_if_break:
18768 case Intrinsic::amdgcn_if:
18769 case Intrinsic::amdgcn_else:
18770 Result = true;
18771 break;
18772 }
18773 }
18774 if (V == U->getOperand(0)) {
18775 switch (Intrinsic->getIntrinsicID()) {
18776 default:
18777 Result = false;
18778 break;
18779 case Intrinsic::amdgcn_end_cf:
18780 case Intrinsic::amdgcn_loop:
18781 Result = true;
18782 break;
18783 }
18784 }
18785 } else {
18786 Result = hasCFUser(U, Visited, WaveSize);
18787 }
18788 if (Result)
18789 break;
18790 }
18791 return Result;
18792}
18793
18795 const Value *V) const {
18796 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
18797 if (CI->isInlineAsm()) {
18798 // FIXME: This cannot give a correct answer. This should only trigger in
18799 // the case where inline asm returns mixed SGPR and VGPR results, used
18800 // outside the defining block. We don't have a specific result to
18801 // consider, so this assumes if any value is SGPR, the overall register
18802 // also needs to be SGPR.
18803 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
18805 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
18806 for (auto &TC : TargetConstraints) {
18807 if (TC.Type == InlineAsm::isOutput) {
18809 const TargetRegisterClass *RC =
18810 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
18811 TC.ConstraintVT)
18812 .second;
18813 if (RC && SIRI->isSGPRClass(RC))
18814 return true;
18815 }
18816 }
18817 }
18818 }
18820 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18821}
18822
18824 for (SDUse &Use : N->uses()) {
18826 if (getBasePtrIndex(M) == Use.getOperandNo())
18827 return true;
18828 }
18829 }
18830 return false;
18831}
18832
18834 SDValue N1) const {
18835 if (!N0.hasOneUse())
18836 return false;
18837 // Take care of the opportunity to keep N0 uniform
18838 if (N0->isDivergent() || !N1->isDivergent())
18839 return true;
18840 // Check if we have a good chance to form the memory access pattern with the
18841 // base and offset
18842 return (DAG.isBaseWithConstantOffset(N0) &&
18844}
18845
18847 Register N0, Register N1) const {
18848 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
18849}
18850
18853 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
18855 if (I.getMetadata("amdgpu.noclobber"))
18856 Flags |= MONoClobber;
18857 if (I.getMetadata("amdgpu.last.use"))
18858 Flags |= MOLastUse;
18859 return Flags;
18860}
18861
18863 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
18864 const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const {
18865 if (User->getOpcode() != ISD::CopyToReg)
18866 return false;
18867 if (!Def->isMachineOpcode())
18868 return false;
18870 if (!MDef)
18871 return false;
18872
18873 unsigned ResNo = User->getOperand(Op).getResNo();
18874 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
18875 return false;
18876 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
18877 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
18878 PhysReg = AMDGPU::SCC;
18879 const TargetRegisterClass *RC =
18880 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
18881 Cost = RC->getCopyCost();
18882 return true;
18883 }
18884 return false;
18885}
18886
18888 Instruction *AI) const {
18889 // Given: atomicrmw fadd ptr %addr, float %val ordering
18890 //
18891 // With this expansion we produce the following code:
18892 // [...]
18893 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
18894 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
18895 //
18896 // atomicrmw.shared:
18897 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
18898 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
18899 // float %val ordering
18900 // br label %atomicrmw.phi
18901 //
18902 // atomicrmw.check.private:
18903 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
18904 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
18905 //
18906 // atomicrmw.private:
18907 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
18908 // %loaded.private = load float, ptr addrspace(5) %cast.private
18909 // %val.new = fadd float %loaded.private, %val
18910 // store float %val.new, ptr addrspace(5) %cast.private
18911 // br label %atomicrmw.phi
18912 //
18913 // atomicrmw.global:
18914 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
18915 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
18916 // float %val ordering
18917 // br label %atomicrmw.phi
18918 //
18919 // atomicrmw.phi:
18920 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
18921 // [ %loaded.private, %atomicrmw.private ],
18922 // [ %loaded.global, %atomicrmw.global ]
18923 // br label %atomicrmw.end
18924 //
18925 // atomicrmw.end:
18926 // [...]
18927 //
18928 //
18929 // For 64-bit atomics which may reside in private memory, we perform a simpler
18930 // version that only inserts the private check, and uses the flat operation.
18931
18932 IRBuilder<> Builder(AI);
18933 LLVMContext &Ctx = Builder.getContext();
18934
18935 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
18936 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
18938 Value *Addr = AI->getOperand(PtrOpIdx);
18939
18940 /// TODO: Only need to check private, then emit flat-known-not private (no
18941 /// need for shared block, or cast to global).
18943
18944 Align Alignment;
18945 if (RMW)
18946 Alignment = RMW->getAlign();
18947 else if (CX)
18948 Alignment = CX->getAlign();
18949 else
18950 llvm_unreachable("unhandled atomic operation");
18951
18952 // FullFlatEmulation is true if we need to issue the private, shared, and
18953 // global cases.
18954 //
18955 // If this is false, we are only dealing with the flat-targeting-private case,
18956 // where we only insert a check for private and still use the flat instruction
18957 // for global and shared.
18958
18959 bool FullFlatEmulation =
18960 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
18961 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18962 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18963 RMW->getType()->isDoubleTy()));
18964
18965 // If the return value isn't used, do not introduce a false use in the phi.
18966 bool ReturnValueIsUsed = !AI->use_empty();
18967
18968 BasicBlock *BB = Builder.GetInsertBlock();
18969 Function *F = BB->getParent();
18970 BasicBlock *ExitBB =
18971 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
18972 BasicBlock *SharedBB = nullptr;
18973
18974 BasicBlock *CheckPrivateBB = BB;
18975 if (FullFlatEmulation) {
18976 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
18977 CheckPrivateBB =
18978 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
18979 }
18980
18981 BasicBlock *PrivateBB =
18982 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
18983 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
18984 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
18985
18986 std::prev(BB->end())->eraseFromParent();
18987 Builder.SetInsertPoint(BB);
18988
18989 Value *LoadedShared = nullptr;
18990 if (FullFlatEmulation) {
18991 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18992 {Addr}, nullptr, "is.shared");
18993 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18994 Builder.SetInsertPoint(SharedBB);
18995 Value *CastToLocal = Builder.CreateAddrSpaceCast(
18997
18998 Instruction *Clone = AI->clone();
18999 Clone->insertInto(SharedBB, SharedBB->end());
19000 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
19001 LoadedShared = Clone;
19002
19003 Builder.CreateBr(PhiBB);
19004 Builder.SetInsertPoint(CheckPrivateBB);
19005 }
19006
19007 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19008 {Addr}, nullptr, "is.private");
19009 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19010
19011 Builder.SetInsertPoint(PrivateBB);
19012
19013 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19015
19016 Value *LoadedPrivate;
19017 if (RMW) {
19018 LoadedPrivate = Builder.CreateAlignedLoad(
19019 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19020
19021 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
19022 LoadedPrivate, RMW->getValOperand());
19023
19024 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19025 } else {
19026 auto [ResultLoad, Equal] =
19027 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
19028 CX->getNewValOperand(), CX->getAlign());
19029
19030 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
19031 ResultLoad, 0);
19032 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19033 }
19034
19035 Builder.CreateBr(PhiBB);
19036
19037 Builder.SetInsertPoint(GlobalBB);
19038
19039 // Continue using a flat instruction if we only emitted the check for private.
19040 Instruction *LoadedGlobal = AI;
19041 if (FullFlatEmulation) {
19042 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19044 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
19045 }
19046
19047 AI->removeFromParent();
19048 AI->insertInto(GlobalBB, GlobalBB->end());
19049
19050 // The new atomicrmw may go through another round of legalization later.
19051 if (!FullFlatEmulation) {
19052 // We inserted the runtime check already, make sure we do not try to
19053 // re-expand this.
19054 // TODO: Should union with any existing metadata.
19055 MDBuilder MDB(F->getContext());
19056 MDNode *RangeNotPrivate =
19059 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19060 RangeNotPrivate);
19061 }
19062
19063 Builder.CreateBr(PhiBB);
19064
19065 Builder.SetInsertPoint(PhiBB);
19066
19067 if (ReturnValueIsUsed) {
19068 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
19069 AI->replaceAllUsesWith(Loaded);
19070 if (FullFlatEmulation)
19071 Loaded->addIncoming(LoadedShared, SharedBB);
19072 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19073 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19074 Loaded->takeName(AI);
19075 }
19076
19077 Builder.CreateBr(ExitBB);
19078}
19079
19081 unsigned PtrOpIdx) {
19082 Value *PtrOp = I->getOperand(PtrOpIdx);
19085
19086 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
19087 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
19088 I->getIterator());
19089 I->setOperand(PtrOpIdx, ASCast);
19090}
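// Illustrative sketch (editorial addition) of the rewrite performed above:
//   %v = load atomic i32, ptr addrspace(5) %slot seq_cst, align 4
// becomes
//   %scratch.ascast = addrspacecast ptr addrspace(5) %slot to ptr
//   %v = load atomic i32, ptr %scratch.ascast seq_cst, align 4
// so the access is handled by the flat-address-space atomic path.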
19091
19094
19097
19100 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
19101 ConstVal && ConstVal->isNullValue()) {
19102 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
19104
19105 // We may still need the private-alias-flat handling below.
19106
19107 // TODO: Skip this for cases where we cannot access remote memory.
19108 }
19109 }
19110
19111 // The non-flat expansions should only perform the de-canonicalization of
19112 // identity values.
19114 return;
19115
19117}
19118
19125
19129
19131 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19132}
19133
19135 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
19136 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
19137
19139 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19140}
19141
19142LoadInst *
19144 IRBuilder<> Builder(AI);
19145 auto Order = AI->getOrdering();
19146
19147 // The optimization removes the store aspect of the atomicrmw. Therefore, the
19148 // cache must be flushed if the atomic ordering had release semantics. This
19149 // does not necessarily require a fence; a release fence just happens to do
19150 // that flush. Avoid replacing an atomicrmw that has release semantics.
19151 if (isReleaseOrStronger(Order))
19152 return nullptr;
19153
19154 LoadInst *LI = Builder.CreateAlignedLoad(
19155 AI->getType(), AI->getPointerOperand(), AI->getAlign());
19156 LI->setAtomic(Order, AI->getSyncScopeID());
19157 LI->copyMetadata(*AI);
19158 LI->takeName(AI);
19159 AI->replaceAllUsesWith(LI);
19160 AI->eraseFromParent();
19161 return LI;
19162}
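// Illustrative example (editorial addition): an idempotent RMW such as
//   %old = atomicrmw or ptr %p, i32 0 acquire
// performs no store that other threads could observe, so the code above can
// rewrite it to
//   %old = load atomic i32, ptr %p acquire, align 4
// provided the original ordering is not release or stronger.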
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:298
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1254
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1251
static cl::opt< bool > UseSelectionDAGPTRADD("amdgpu-use-sdag-ptradd", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " "SelectionDAG ISel"), cl::init(false))
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
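For illustration, a minimal sketch (not the in-tree implementation) of the pattern this helper wraps, assuming DAG, SL (an SDLoc) and Op (an i64 SDValue) are already in scope:
    // Bitcast the 64-bit value to v2i32 and pull out the two halves.
    SDValue Vec = DAG.getBitcast(MVT::v2i32, Op);
    SDValue Lo = DAG.getExtractVectorElt(SL, MVT::i32, Vec, 0);
    SDValue Hi = DAG.getExtractVectorElt(SL, MVT::i32, Vec, 1);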
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1120
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:6057
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1497
bool isNegative() const
Definition APFloat.h:1449
bool isNormal() const
Definition APFloat.h:1453
APInt bitcastToAPInt() const
Definition APFloat.h:1353
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1138
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1098
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1079
bool isInfinity() const
Definition APFloat.h:1446
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1391
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1385
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:258
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:380
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:466
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1639
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:366
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:296
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1237
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1221
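A hedged example of the APInt helpers listed above; the resulting values are noted in the comments.
    APInt Mask = APInt::getBitsSet(32, 8, 16);  // bits [8,16) set: 0x0000FF00
    APInt High = APInt::getHighBitsSet(32, 4);  // top 4 bits set: 0xF0000000
    unsigned TZ = Mask.countr_zero();           // 8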
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:339
const Function * getParent() const
Definition Argument.h:44
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:147
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:142
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
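As a hedged sketch of how one of the operations above is created at the IR level (Builder, Ptr and Val are assumed to be an IRBuilder<>, a pointer Value* and an integer Value* already in scope):
    // atomicrmw add: *Ptr = *Ptr + Val, returning the old value.
    AtomicRMWInst *RMW = Builder.CreateAtomicRMW(
        AtomicRMWInst::Add, Ptr, Val, MaybeAlign(),
        AtomicOrdering::SequentiallyConsistent);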
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:472
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents the known origin of an individual byte in a combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
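A hedged illustration of the two factory methods above, assuming Op is an SDValue in scope:
    // Byte known to be zero, and the lowest byte of Op, respectively.
    ByteProvider<SDValue> Zero = ByteProvider<SDValue>::getConstantZero();
    ByteProvider<SDValue> Lo0 =
        ByteProvider<SDValue>::getSrc(Op, /*ByteOffset=*/0, /*VectorOffset=*/0);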
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
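A hedged sketch of the analysis flow these methods support; CallConv, IsVarArg, MF, Ins and Ctx are assumed to be in scope:
    // Assign each formal argument to a register or stack slot.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
    CCInfo.AnalyzeFormalArguments(
        Ins, AMDGPUTargetLowering::CCAssignFnForCall(CallConv, IsVarArg));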
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:678
@ ICMP_NE
not equal
Definition InstrTypes.h:700
bool isSigned() const
Definition InstrTypes.h:932
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:772
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:778
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:214
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:63
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:208
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:803
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
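A hedged example of the LLT factory helpers above:
    LLT S32 = LLT::scalar(32);                                     // 32-bit scalar
    LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);  // 64-bit pointer in AS 1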
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1077
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1441
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
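A hedged sketch of creating and inserting a new block, a pattern custom inserters commonly use; MF, MBB and BB are assumed to be a MachineFunction&, a MachineBasicBlock& and the corresponding IR BasicBlock*:
    // Create a block and place it immediately after MBB in the function's block list.
    MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(BB);
    MachineFunction::iterator InsertPt(MBB);
    MF.insert(++InsertPt, NewMBB);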
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
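A hedged example of the builder chaining these methods enable; TII, MBB, I (insertion point), DL and DstReg are assumed to be in scope:
    // Materialize a 32-bit scalar immediate into DstReg.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
        .addImm(0x7fffffff);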
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:221
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:215
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:218
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:67
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, MCRegister &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific method.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific method.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
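A hedged sketch (not the in-tree lowering) of the cmp/select expansion referred to above, assuming DAG, DL, Vec (a v4f32 SDValue) and Idx (an i32 SDValue) are in scope:
    // Select the lane whose constant index matches the variable index.
    SDValue Result = DAG.getExtractVectorElt(DL, MVT::f32, Vec, 0);
    for (unsigned I = 1; I < 4; ++I) {
      SDValue Elt = DAG.getExtractVectorElt(DL, MVT::f32, Vec, I);
      SDValue IsI = DAG.getSetCC(DL, MVT::i1, Idx,
                                 DAG.getConstant(I, DL, MVT::i32), ISD::SETEQ);
      Result = DAG.getSelect(DL, MVT::f32, IsI, Elt, Result);
    }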
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
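For illustration, the constant-creation helpers used throughout this file differ mainly in the node kind they produce (DAG and DL assumed in scope):
  SDValue Zero  = DAG.getConstant(0, DL, MVT::i32);        // ordinary ConstantSDNode
  SDValue FourF = DAG.getConstantFP(4.0, DL, MVT::f32);    // ConstantFPSDNode
  SDValue Imm   = DAG.getTargetConstant(7, DL, MVT::i32);  // immediate operand for instruction selection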
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
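A hedged sketch of a paired load and store, chaining the store on the load's output chain; Chain, Ptr, NewPtr, and PtrInfo are assumed to exist in the surrounding lowering code:
  SDValue Load  = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo, Align(4));
  SDValue Store = DAG.getStore(Load.getValue(1), DL, Load, NewPtr,
                               MachinePointerInfo(), Align(4));  // result 1 of a load is its chain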
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
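Most nodes are created through getNode; an illustrative fragment (A, B, FVal assumed in scope):
  SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, A, B);           // binary form
  SDValue Can = DAG.getNode(ISD::FCANONICALIZE, DL, MVT::f32, FVal); // unary form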
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
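A small sketch contrasting the splat helper with the general BUILD_VECTOR helper (Elt, Lo, Hi are assumed i32 SDValues):
  SDValue Splat = DAG.getSplatBuildVector(MVT::v4i32, DL, Elt); // <Elt, Elt, Elt, Elt>
  SDValue Pair  = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); // arbitrary elements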
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
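For example, addressing a field eight bytes into an object pointed to by a hypothetical BasePtr might look like:
  SDValue FieldPtr = DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(8));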
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
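A hedged sketch of the usual split-then-merge sequence, assuming Wide is an i64 SDValue and WideVT its type:
  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(WideVT);            // i64 -> (i32, i32)
  auto [Lo, Hi]     = DAG.SplitScalar(Wide, DL, LoVT, HiVT);
  SDValue Merged    = DAG.getMergeValues({Lo, Hi}, DL);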
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
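An illustrative (not file-specific) use of StringSwitch, mapping a StringRef Name to a small code; the case strings are hypothetical:
  unsigned Kind = llvm::StringSwitch<unsigned>(Name)
                      .Case("load", 0)
                      .Case("store", 1)
                      .Default(2);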
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
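A hedged sketch of how a target constructor typically drives these hooks; the specific actions below are illustrative, not the actual SI configuration:
  setOperationAction(ISD::SELECT, MVT::i1, Promote);   // legalize i1 selects by promotion
  AddPromotedToType(ISD::SELECT, MVT::i1, MVT::i32);   // ...to i32
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);     // no native i64->i16 truncating store
  setTargetDAGCombine({ISD::ADD, ISD::FADD});          // request custom combines on these nodes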
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
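As a sketch, a custom lowering hook can fall back to this helper when a vector load type is unsupported (Op and DL assumed in scope):
  auto [Result, NewChain] = scalarizeVectorLoad(cast<LoadSDNode>(Op.getNode()), DAG);
  return DAG.getMergeValues({Result, NewChain}, DL);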
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:420
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:258
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:107
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI void set(Value *Val)
Definition Value.h:905
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
const Use & getOperandUse(unsigned i) const
Definition User.h:245
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1101
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:396
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isZero() const
Definition TypeSize.h:154
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid character code (or 0) in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:801
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:774
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:587
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:765
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:835
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:862
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:571
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:738
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:826
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:656
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:773
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:528
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:535
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:778
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:952
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:695
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:756
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:636
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:601
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:563
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:832
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:793
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:870
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:718
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:787
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:908
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:730
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:552
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:941
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:838
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:815
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:521
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:543
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
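A minimal sketch of DAG pattern matching with sd_match, assuming SDPatternMatch provides m_Add and a value-binding m_Value analogous to the IR-level matchers:
  using namespace llvm::SDPatternMatch;
  SDValue X, Y;
  if (sd_match(N, &DAG, m_Add(m_Value(X), m_Value(Y))))
    return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), Y, X); // commuted rebuild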
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:241
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:174
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
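The usual isa / dyn_cast / cast idiom on SDNode operands, shown as a fragment (Op assumed to be an SDValue with at least two operands):
  uint64_t Imm = 0;
  if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
    Imm = C->getZExtValue();                          // only valid when the cast succeeded
  bool IsFI = isa<FrameIndexSDNode>(Op.getOperand(0));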
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for an N-bit signed integer.
Definition MathExtras.h:232
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:289
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:557
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:296
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:396
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:186
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:282
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1712
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:222
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:288
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:159
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:198
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:207
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:164
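A short worked example of these MathExtras helpers on a 64-bit constant:
  uint64_t V  = 0x0000000100000040ULL;
  uint32_t Hi = Hi_32(V);              // 0x00000001
  uint32_t Lo = Lo_32(V);              // 0x00000040
  bool Pow2   = isPowerOf2_32(Lo);     // true
  unsigned L2 = Log2_32(Lo);           // 6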
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:71
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:241
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1877
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:86
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:853
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:304
static LLVM_ABI const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:264
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:256
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
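A hedged fragment showing the EVT queries most common in lowering code (Op assumed to be an SDValue):
  EVT VT = Op.getValueType();
  unsigned EltBits = VT.getScalarSizeInBits();
  bool NarrowVec   = VT.isVector() && EltBits == 16;  // e.g. v2i16 / v2f16
  EVT IntVT        = VT.changeTypeToInteger();        // same size, integer elements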
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:225
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
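A sketch combining the KnownBits interface with the SelectionDAG query, assuming A and B are i32 SDValues in scope:
  KnownBits LHSK = DAG.computeKnownBits(A);
  KnownBits RHSK = DAG.computeKnownBits(B);
  KnownBits Sum  = KnownBits::add(LHSK, RHSK);   // known bits of A + B
  unsigned LeadZ = Sum.countMinLeadingZeros();
  KnownBits Wide = Sum.zext(64);                 // after zero-extension to 64 bits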
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs