//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternating creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
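
// For example, schedule(dynamic, 4) with no ordered clause yields
// BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic, while
// schedule(static) combined with an ordered clause yields OrderedStatic with
// no monotonicity flag, since monotonic is the runtime default in that case.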

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which will be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale to move those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This
  //    is not the case here. The `Old` block is still being used, e.g., a
  //    branch instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};
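
// These flags combine as a bitmask. For instance, a translation unit with
//   #pragma omp requires unified_shared_memory reverse_offload
// yields OMP_REQ_UNIFIED_SHARED_MEMORY | OMP_REQ_REVERSE_OFFLOAD (0x00a).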

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };
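
  // For example, on a target where TargetLibraryInfo::getExtAttrForI32Param
  // returns SExt, an i32 parameter declared with a {signext} attribute set in
  // OMPKinds.def is mapped to the target's required extension attribute
  // rather than copied verbatim.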

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
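        // The resulting declaration looks roughly like:
        //   declare !callback !1 void @__kmpc_fork_call(ptr, i32, ptr, ...)
        //   !1 = !{!2}
        //   !2 = !{i64 2, i64 -1, i64 -1, i1 true}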
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target or risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so). This notably has to occur
  // after the OutlinedInfo candidates have been extracted so we have an end
  // product that will not be implicitly adversely affected by any raises
  // unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in
  // movement of any stores to variables the allocation size depends on, as
  // well as the usual loads, otherwise it'll yield the wrong result after
  // movement) and would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);
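
  // For example, createGlobalFlag(1, "__omp_rtl_debug_kind") produces roughly:
  //   @__omp_rtl_debug_kind = weak_odr hidden constant i32 1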

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
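
  // For example, function "foo" in "file.c" at line 3, column 7 is encoded as
  // ";file.c;foo;3;7;;", matching the ";unknown;unknown;0;0;;" default used
  // by getOrCreateDefaultSrcLocStr below.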
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
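
// In a cancellable parallel region the code above emits roughly
//   %r = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %omp_global_thread_num)
// and, when CheckCancelFlag is set, branches on %r to the cancellation path;
// otherwise it emits a plain call to @__kmpc_barrier with the same arguments.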

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply calls the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
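
  // At this point, for an outlined function @foo..omp_par (hypothetical name)
  // with two captured pointers %x and %y and no if clause, RealArgs holds
  // {Ident, i32 2, @foo..omp_par, %x, %y}, so the call below emits roughly:
  //   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @ident, i32 2,
  //       ptr @foo..omp_par, ptr %x, ptr %y)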

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1492
1493OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1494 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1495 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1496 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1497 omp::ProcBindKind ProcBind, bool IsCancellable) {
1498 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1499
1500 if (!updateToLocation(Loc))
1501 return Loc.IP;
1502
1503 uint32_t SrcLocStrSize;
1504 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1505 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1506 Value *ThreadID = getOrCreateThreadID(Ident);
1507 // If we generate code for the target device, we need to allocate
1508 // struct for aggregate params in the device default alloca address space.
1509 // OpenMP runtime requires that the params of the extracted functions are
1510 // passed as zero address space pointers. This flag ensures that extracted
1511 // function arguments are declared in zero address space
1512 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1513
1514 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1515 // only if we compile for host side.
1516 if (NumThreads && !Config.isTargetDevice()) {
1517 Value *Args[] = {
1518 Ident, ThreadID,
1519 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1520 Builder.CreateCall(
1521 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1522 }
1523
1524 if (ProcBind != OMP_PROC_BIND_default) {
1525 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1526 Value *Args[] = {
1527 Ident, ThreadID,
1528 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1529 Builder.CreateCall(
1530 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1531 }
1532
1533 BasicBlock *InsertBB = Builder.GetInsertBlock();
1534 Function *OuterFn = InsertBB->getParent();
1535
1536 // Save the outer alloca block because the insertion iterator may get
1537 // invalidated and we still need this later.
1538 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1539
1540 // Vector to remember instructions we used only during the modeling but which
1541 // we want to delete at the end.
1542 SmallVector<Instruction *, 4> ToBeDeleted;
1543
1544 // Change the location to the outer alloca insertion point to create and
1545 // initialize the allocas we pass into the parallel region.
1546 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1547 Builder.restoreIP(NewOuter);
1548 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1549 AllocaInst *ZeroAddrAlloca =
1550 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1551 Instruction *TIDAddr = TIDAddrAlloca;
1552 Instruction *ZeroAddr = ZeroAddrAlloca;
1553 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1554 // Add additional casts to enforce pointers in zero address space
1555 TIDAddr = new AddrSpaceCastInst(
1556 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1557 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1558 ToBeDeleted.push_back(TIDAddr);
1559 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1560 PointerType ::get(M.getContext(), 0),
1561 "zero.addr.ascast");
1562 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1563 ToBeDeleted.push_back(ZeroAddr);
1564 }
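// For example, on amdgcn targets, where allocas live in addrspace(5), the
// casts above produce generic (addrspace(0)) pointers as expected by the
// OpenMP runtime; on targets whose alloca address space is already 0, no
// cast is emitted at all.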
1565
1566 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1567 // associated arguments in the outlined function, so we delete them later.
1568 ToBeDeleted.push_back(TIDAddrAlloca);
1569 ToBeDeleted.push_back(ZeroAddrAlloca);
1570
1571 // Create an artificial insertion point that will also ensure the blocks we
1572 // are about to split do not become degenerate.
1573 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1574
1575 BasicBlock *EntryBB = UI->getParent();
1576 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1577 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1578 BasicBlock *PRegPreFiniBB =
1579 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1580 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1581
1582 auto FiniCBWrapper = [&](InsertPointTy IP) {
1583 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1584 // target to the region exit block.
1585 if (IP.getBlock()->end() == IP.getPoint()) {
1586 IRBuilder<>::InsertPointGuard IPG(Builder);
1587 Builder.restoreIP(IP);
1588 Instruction *I = Builder.CreateBr(PRegExitBB);
1589 IP = InsertPointTy(I->getParent(), I->getIterator());
1590 }
1591 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1592 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1593 "Unexpected insertion point for finalization call!");
1594 return FiniCB(IP);
1595 };
1596
1597 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1598
1599 // Generate the privatization allocas in the block that will become the entry
1600 // of the outlined function.
1601 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1602 InsertPointTy InnerAllocaIP = Builder.saveIP();
1603
1604 AllocaInst *PrivTIDAddr =
1605 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1606 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1607
1608 // Add some fake uses for OpenMP-provided arguments.
1609 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1610 Instruction *ZeroAddrUse =
1611 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1612 ToBeDeleted.push_back(ZeroAddrUse);
1613
1614 // EntryBB
1615 // |
1616 // V
1617 // PRegionEntryBB <- Privatization allocas are placed here.
1618 // |
1619 // V
1620 // PRegionBodyBB <- BodyGen is invoked here.
1621 // |
1622 // V
1623 // PRegPreFiniBB <- The block we will start finalization from.
1624 // |
1625 // V
1626 // PRegionExitBB <- A common exit to simplify block collection.
1627 //
1628
1629 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1630
1631 // Let the caller create the body.
1632 assert(BodyGenCB && "Expected body generation callback!");
1633 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1634 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1635 return Err;
1636
1637 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1638
1639 OutlineInfo OI;
1640 if (Config.isTargetDevice()) {
1641 // Generate OpenMP target specific runtime call
1642 OI.PostOutlineCB = [=, ToBeDeletedVec =
1643 std::move(ToBeDeleted)](Function &OutlinedFn) {
1644 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1645 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1646 ThreadID, ToBeDeletedVec);
1647 };
1648 } else {
1649 // Generate OpenMP host runtime call
1650 OI.PostOutlineCB = [=, ToBeDeletedVec =
1651 std::move(ToBeDeleted)](Function &OutlinedFn) {
1652 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1653 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1654 };
1655 }
1656
1657 OI.OuterAllocaBB = OuterAllocaBlock;
1658 OI.EntryBB = PRegEntryBB;
1659 OI.ExitBB = PRegExitBB;
1660
1661 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1662 SmallVector<BasicBlock *, 32> Blocks;
1663 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1664
1665 CodeExtractorAnalysisCache CEAC(*OuterFn);
1666 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1667 /* AggregateArgs */ false,
1668 /* BlockFrequencyInfo */ nullptr,
1669 /* BranchProbabilityInfo */ nullptr,
1670 /* AssumptionCache */ nullptr,
1671 /* AllowVarArgs */ true,
1672 /* AllowAlloca */ true,
1673 /* AllocationBlock */ OuterAllocaBlock,
1674 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1675
1676 // Find inputs to, outputs from the code region.
1677 BasicBlock *CommonExit = nullptr;
1678 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1679 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1680
1681 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1682 /*CollectGlobalInputs=*/true);
1683
1684 Inputs.remove_if([&](Value *I) {
1685 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1686 return GV->getValueType() == OpenMPIRBuilder::Ident;
1687
1688 return false;
1689 });
1690
1691 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1692
1693 FunctionCallee TIDRTLFn =
1694 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1695
1696 auto PrivHelper = [&](Value &V) -> Error {
1697 if (&V == TIDAddr || &V == ZeroAddr) {
1698 OI.ExcludeArgsFromAggregate.push_back(&V);
1699 return Error::success();
1700 }
1701
1702 SetVector<Use *> Uses;
1703 for (Use &U : V.uses())
1704 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1705 if (ParallelRegionBlockSet.count(UserI->getParent()))
1706 Uses.insert(&U);
1707
1708 // __kmpc_fork_call expects extra arguments as pointers. If the input
1709 // already has a pointer type, everything is fine. Otherwise, store the
1710 // value onto the stack and load it back inside the to-be-outlined region.
1711 // This ensures only the pointer is passed to the function.
1712 // FIXME: if there are more than 15 trailing arguments, they must be
1713 // additionally packed in a struct.
1714 Value *Inner = &V;
1715 if (!V.getType()->isPointerTy()) {
1716 IRBuilder<>::InsertPointGuard Guard(Builder);
1717 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1718
1719 Builder.restoreIP(OuterAllocaIP);
1720 Value *Ptr =
1721 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1722
1723 // Store to stack at end of the block that currently branches to the entry
1724 // block of the to-be-outlined region.
1725 Builder.SetInsertPoint(InsertBB,
1726 InsertBB->getTerminator()->getIterator());
1727 Builder.CreateStore(&V, Ptr);
1728
1729 // Load back next to allocations in the to-be-outlined region.
1730 Builder.restoreIP(InnerAllocaIP);
1731 Inner = Builder.CreateLoad(V.getType(), Ptr);
1732 }
1733
1734 Value *ReplacementValue = nullptr;
1735 CallInst *CI = dyn_cast<CallInst>(&V);
1736 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1737 ReplacementValue = PrivTID;
1738 } else {
1739 InsertPointOrErrorTy AfterIP =
1740 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1741 if (!AfterIP)
1742 return AfterIP.takeError();
1743 Builder.restoreIP(*AfterIP);
1744 InnerAllocaIP = {
1745 InnerAllocaIP.getBlock(),
1746 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1747
1748 assert(ReplacementValue &&
1749 "Expected copy/create callback to set replacement value!");
1750 if (ReplacementValue == &V)
1751 return Error::success();
1752 }
1753
1754 for (Use *UPtr : Uses)
1755 UPtr->set(ReplacementValue);
1756
1757 return Error::success();
1758 };
1759
1760 // Reset the inner alloca insertion as it will be used for loading the values
1761 // wrapped into pointers before passing them into the to-be-outlined region.
1762 // Configure it to insert immediately after the fake use of zero address so
1763 // that the reloaded values are available in the generated body and so the
1764 // OpenMP-related values (thread ID and zero address pointers) remain leading
1765 // in the argument list.
1766 InnerAllocaIP = IRBuilder<>::InsertPoint(
1767 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1768
1769 // Reset the outer alloca insertion point to the entry of the relevant block
1770 // in case it was invalidated.
1771 OuterAllocaIP = IRBuilder<>::InsertPoint(
1772 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1773
1774 for (Value *Input : Inputs) {
1775 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1776 if (Error Err = PrivHelper(*Input))
1777 return Err;
1778 }
1779 LLVM_DEBUG({
1780 for (Value *Output : Outputs)
1781 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1782 });
1783 assert(Outputs.empty() &&
1784 "OpenMP outlining should not produce live-out values!");
1785
1786 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1787 LLVM_DEBUG({
1788 for (auto *BB : Blocks)
1789 dbgs() << " PBR: " << BB->getName() << "\n";
1790 });
1791
1792 // Adjust the finalization stack, verify the adjustment, and call the
1793 // finalize function one last time to finalize values between the pre-fini
1794 // block and the exit block if we left the parallel region "the normal way".
1795 auto FiniInfo = FinalizationStack.pop_back_val();
1796 (void)FiniInfo;
1797 assert(FiniInfo.DK == OMPD_parallel &&
1798 "Unexpected finalization stack state!");
1799
1800 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1801
1802 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1803 if (Error Err = FiniCB(PreFiniIP))
1804 return Err;
1805
1806 // Register the outlined info.
1807 addOutlineInfo(std::move(OI));
1808
1809 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1810 UI->eraseFromParent();
1811
1812 return AfterIP;
1813}
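// A minimal usage sketch for createParallel (illustrative only; the callbacks
// below and the surrounding builder/module setup are assumptions, not part of
// this file):
//   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
//   using InsertPointOrErrorTy = OpenMPIRBuilder::InsertPointOrErrorTy;
//   OpenMPIRBuilder OMPBuilder(M);
//   OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL);
//   auto BodyGenCB = [&](InsertPointTy AllocaIP,
//                        InsertPointTy CodeGenIP) -> Error {
//     // Emit the parallel region body at CodeGenIP.
//     return Error::success();
//   };
//   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
//                     Value &Original, Value &Inner,
//                     Value *&ReplVal) -> InsertPointOrErrorTy {
//     ReplVal = &Inner; // Share every captured value by default.
//     return CodeGenIP;
//   };
//   auto FiniCB = [&](InsertPointTy IP) -> Error { return Error::success(); };
//   InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
//       Loc, OuterAllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
//       /*NumThreads=*/nullptr, OMP_PROC_BIND_default, /*IsCancellable=*/false);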
1814
1815void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1816 // Build call void __kmpc_flush(ident_t *loc)
1817 uint32_t SrcLocStrSize;
1818 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1819 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1820
1821 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1822}
1823
1824void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1825 if (!updateToLocation(Loc))
1826 return;
1827 emitFlush(Loc);
1828}
1829
1830void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1831 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1832 // global_tid);
1833 uint32_t SrcLocStrSize;
1834 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1835 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1836 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1837
1838 // Ignore the return result until untied tasks are supported.
1839 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1840 Args);
1841}
1842
1843void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1844 if (!updateToLocation(Loc))
1845 return;
1846 emitTaskwaitImpl(Loc);
1847}
1848
1849void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1850 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1851 uint32_t SrcLocStrSize;
1852 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1853 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1854 Constant *I32Null = ConstantInt::getNullValue(Int32);
1855 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1856
1857 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1858 Args);
1859}
1860
1861void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1862 if (!updateToLocation(Loc))
1863 return;
1864 emitTaskyieldImpl(Loc);
1865}
1866
1867 // Processes the dependencies in Dependencies and does the following:
1868 // - Allocates space on the stack for an array of DependInfo objects
1869 // - Populates each DependInfo object with the relevant information from
1870 // the corresponding dependence.
1871 // - All code is inserted in the entry block of the current function.
1872 static Value *emitTaskDependencies(
1873 OpenMPIRBuilder &OMPBuilder,
1874 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1875 // Early return if we have no dependencies to process.
1876 if (Dependencies.empty())
1877 return nullptr;
1878
1879 // Given a vector of DependData objects, in this function we create an
1880 // array on the stack that holds kmp_depend_info objects corresponding
1881 // to each dependency. This is then passed to the OpenMP runtime.
1882 // For example, if there are 'n' dependencies then the following pseudo
1883 // code is generated. Assume the first dependence is on a variable 'a'.
1884 //
1885 // \code{c}
1886 // DepArray = alloc(n x sizeof(kmp_depend_info));
1887 // idx = 0;
1888 // DepArray[idx].base_addr = ptrtoint(&a);
1889 // DepArray[idx].len = 8;
1890 // DepArray[idx].flags = Dep.DepKind; /* (See OMPConstants.h for DepKind) */
1891 // ++idx;
1892 // DepArray[idx].base_addr = ...;
1893 // \endcode
1894
1895 IRBuilderBase &Builder = OMPBuilder.Builder;
1896 Type *DependInfo = OMPBuilder.DependInfo;
1897 Module &M = OMPBuilder.M;
1898
1899 Value *DepArray = nullptr;
1900 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1901 Builder.SetInsertPoint(
1902 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1903
1904 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1905 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1906
1907 Builder.restoreIP(OldIP);
1908
1909 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1910 Value *Base =
1911 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1912 // Store the pointer to the variable
1913 Value *Addr = Builder.CreateStructGEP(
1914 DependInfo, Base,
1915 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1916 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1917 Builder.CreateStore(DepValPtr, Addr);
1918 // Store the size of the variable
1919 Value *Size = Builder.CreateStructGEP(
1920 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1921 Builder.CreateStore(
1922 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1923 Size);
1924 // Store the dependency kind
1925 Value *Flags = Builder.CreateStructGEP(
1926 DependInfo, Base,
1927 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1928 Builder.CreateStore(
1929 ConstantInt::get(Builder.getInt8Ty(),
1930 static_cast<unsigned int>(Dep.DepKind)),
1931 Flags);
1932 }
1933 return DepArray;
1934}
1935
1936OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1937 const LocationDescription &Loc, InsertPointTy AllocaIP,
1938 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1939 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1940 Value *Priority) {
1941
1942 if (!updateToLocation(Loc))
1943 return InsertPointTy();
1944
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 // The current basic block is split into four basic blocks. After outlining,
1949 // they will be mapped as follows:
1950 // ```
1951 // def current_fn() {
1952 // current_basic_block:
1953 // br label %task.exit
1954 // task.exit:
1955 // ; instructions after task
1956 // }
1957 // def outlined_fn() {
1958 // task.alloca:
1959 // br label %task.body
1960 // task.body:
1961 // ret void
1962 // }
1963 // ```
1964 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1965 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1966 BasicBlock *TaskAllocaBB =
1967 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1968
1969 InsertPointTy TaskAllocaIP =
1970 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1971 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1972 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1973 return Err;
1974
1975 OutlineInfo OI;
1976 OI.EntryBB = TaskAllocaBB;
1977 OI.OuterAllocaBB = AllocaIP.getBlock();
1978 OI.ExitBB = TaskExitBB;
1979
1980 // Add the thread ID argument.
1981 SmallVector<Instruction *, 4> ToBeDeleted;
1982 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1983 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1984
1985 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1986 Mergeable, Priority, EventHandle, TaskAllocaBB,
1987 ToBeDeleted](Function &OutlinedFn) mutable {
1988 // Replace the stale CI with the appropriate RTL function call.
1989 assert(OutlinedFn.hasOneUse() &&
1990 "there must be a single user for the outlined function");
1991 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1992
1993 // HasShareds is true if any variables are captured in the outlined region,
1994 // false otherwise.
1995 bool HasShareds = StaleCI->arg_size() > 1;
1996 Builder.SetInsertPoint(StaleCI);
1997
1998 // Gather the arguments for emitting the runtime call for
1999 // @__kmpc_omp_task_alloc
2000 Function *TaskAllocFn =
2001 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2002
2003 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the
2004 // @__kmpc_omp_task_alloc call.
2005 Value *ThreadID = getOrCreateThreadID(Ident);
2006
2007 // Argument - `flags`
2008 // Task is tied iff (Flags & 1) == 1.
2009 // Task is untied iff (Flags & 1) == 0.
2010 // Task is final iff (Flags & 2) == 2.
2011 // Task is not final iff (Flags & 2) == 0.
2012 // Task is mergeable iff (Flags & 4) == 4.
2013 // Task is not mergeable iff (Flags & 4) == 0.
2014 // Task is priority iff (Flags & 32) == 32.
2015 // Task is not priority iff (Flags & 32) == 0.
2016 // TODO: Handle the other flags.
2017 Value *Flags = Builder.getInt32(Tied);
2018 if (Final) {
2019 Value *FinalFlag =
2020 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2021 Flags = Builder.CreateOr(FinalFlag, Flags);
2022 }
2023
2024 if (Mergeable)
2025 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2026 if (Priority)
2027 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
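// Worked example of the flag arithmetic: a tied task (bit 1) that is also
// mergeable (bit 4) and carries a priority clause (bit 32) but no final
// clause ends up with Flags = 1 | 4 | 32 = 37.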
2028
2029 // Argument - `sizeof_kmp_task_t` (TaskSize)
2030 // TaskSize refers to the size in bytes of the kmp_task_t data structure,
2031 // including private vars accessed in the task.
2032 // TODO: add kmp_task_t_with_privates (privates)
2033 Value *TaskSize = Builder.getInt64(
2034 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
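// E.g. a (hypothetical) kmp_task_t of 320 bits yields TaskSize = 40 bytes;
// divideCeil rounds the bit size up to whole bytes.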
2035
2036 // Argument - `sizeof_shareds` (SharedsSize)
2037 // SharedsSize refers to the shareds array size in the kmp_task_t data
2038 // structure.
2039 Value *SharedsSize = Builder.getInt64(0);
2040 if (HasShareds) {
2041 AllocaInst *ArgStructAlloca =
2042 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2043 assert(ArgStructAlloca &&
2044 "Unable to find the alloca instruction corresponding to arguments "
2045 "for extracted function");
2046 StructType *ArgStructType =
2047 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2048 assert(ArgStructType && "Unable to find struct type corresponding to "
2049 "arguments for extracted function");
2050 SharedsSize =
2051 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2052 }
2053 // Emit the @__kmpc_omp_task_alloc runtime call
2054 // The runtime call returns a pointer to an area where the task captured
2055 // variables must be copied before the task is run (TaskData)
2056 CallInst *TaskData = Builder.CreateCall(
2057 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2058 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2059 /*task_func=*/&OutlinedFn});
2060
2061 // Emit detach clause initialization.
2062 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2063 // task_descriptor);
2064 if (EventHandle) {
2065 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2066 OMPRTL___kmpc_task_allow_completion_event);
2067 llvm::Value *EventVal =
2068 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2069 llvm::Value *EventHandleAddr =
2070 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2071 Builder.getPtrTy(0));
2072 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2073 Builder.CreateStore(EventVal, EventHandleAddr);
2074 }
2075 // Copy the arguments for the outlined function.
2076 if (HasShareds) {
2077 Value *Shareds = StaleCI->getArgOperand(1);
2078 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2079 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2080 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2081 SharedsSize);
2082 }
2083
2084 if (Priority) {
2085 //
2086 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2087 // we populate the priority information into the "kmp_task_t" here
2088 //
2089 // The struct "kmp_task_t" definition is available in kmp.h
2090 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2091 // data2 is used for priority
2092 //
2093 Type *Int32Ty = Builder.getInt32Ty();
2094 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2095 // kmp_task_t* => { ptr }
2096 Type *TaskPtr = StructType::get(VoidPtr);
2097 Value *TaskGEP =
2098 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2099 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2100 Type *TaskStructType = StructType::get(
2101 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2102 Value *PriorityData = Builder.CreateInBoundsGEP(
2103 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2104 // kmp_cmplrdata_t => { ptr, ptr }
2105 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2106 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2107 PriorityData, {Zero, Zero});
2108 Builder.CreateStore(Priority, CmplrData);
2109 }
2110
2111 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2112
2113 // In the presence of the `if` clause, the following IR is generated:
2114 // ...
2115 // %data = call @__kmpc_omp_task_alloc(...)
2116 // br i1 %if_condition, label %then, label %else
2117 // then:
2118 // call @__kmpc_omp_task(...)
2119 // br label %exit
2120 // else:
2121 // ;; Wait for resolution of dependencies, if any, before
2122 // ;; beginning the task
2123 // call @__kmpc_omp_wait_deps(...)
2124 // call @__kmpc_omp_task_begin_if0(...)
2125 // call @outlined_fn(...)
2126 // call @__kmpc_omp_task_complete_if0(...)
2127 // br label %exit
2128 // exit:
2129 // ...
2130 if (IfCondition) {
2131 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2132 // terminator.
2133 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2134 Instruction *IfTerminator =
2135 Builder.GetInsertPoint()->getParent()->getTerminator();
2136 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2137 Builder.SetInsertPoint(IfTerminator);
2138 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2139 &ElseTI);
2140 Builder.SetInsertPoint(ElseTI);
2141
2142 if (Dependencies.size()) {
2143 Function *TaskWaitFn =
2144 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2145 Builder.CreateCall(
2146 TaskWaitFn,
2147 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2148 ConstantInt::get(Builder.getInt32Ty(), 0),
2149 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2150 }
2151 Function *TaskBeginFn =
2152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2153 Function *TaskCompleteFn =
2154 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2155 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2156 CallInst *CI = nullptr;
2157 if (HasShareds)
2158 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2159 else
2160 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2161 CI->setDebugLoc(StaleCI->getDebugLoc());
2162 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2163 Builder.SetInsertPoint(ThenTI);
2164 }
2165
2166 if (Dependencies.size()) {
2167 Function *TaskFn =
2168 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2169 Builder.CreateCall(
2170 TaskFn,
2171 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2172 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2173 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2174
2175 } else {
2176 // Emit the @__kmpc_omp_task runtime call to spawn the task
2177 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2178 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2179 }
2180
2181 StaleCI->eraseFromParent();
2182
2183 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2184 if (HasShareds) {
2185 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2186 OutlinedFn.getArg(1)->replaceUsesWithIf(
2187 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2188 }
2189
2190 for (Instruction *I : llvm::reverse(ToBeDeleted))
2191 I->eraseFromParent();
2192 };
2193
2194 addOutlineInfo(std::move(OI));
2195 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2196
2197 return Builder.saveIP();
2198}
2199
2200OpenMPIRBuilder::InsertPointOrErrorTy
2201OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2202 InsertPointTy AllocaIP,
2203 BodyGenCallbackTy BodyGenCB) {
2204 if (!updateToLocation(Loc))
2205 return InsertPointTy();
2206
2207 uint32_t SrcLocStrSize;
2208 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2209 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2210 Value *ThreadID = getOrCreateThreadID(Ident);
2211
2212 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2213 Function *TaskgroupFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2215 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2216
2217 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2218 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2219 return Err;
2220
2221 Builder.SetInsertPoint(TaskgroupExitBB);
2222 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2223 Function *EndTaskgroupFn =
2224 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2225 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2226
2227 return Builder.saveIP();
2228}
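// The emitted sequence is therefore, roughly (illustrative IR):
//   call void @__kmpc_taskgroup(ptr @loc, i32 %tid)
//   ; ... taskgroup body, typically spawning tasks ...
//   call void @__kmpc_end_taskgroup(ptr @loc, i32 %tid)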
2229
2230OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2231 const LocationDescription &Loc, InsertPointTy AllocaIP,
2232 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2233 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2234 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2235
2236 if (!updateToLocation(Loc))
2237 return Loc.IP;
2238
2239 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2240 // that block may not yet have been created when this callback runs.
2241 SmallVector<BranchInst *> CancellationBranches;
2242 auto FiniCBWrapper = [&](InsertPointTy IP) {
2243 if (IP.getBlock()->end() != IP.getPoint())
2244 return FiniCB(IP);
2245 // This must be done; otherwise any nested constructs using FinalizeOMPRegion
2246 // will fail because that function requires the finalization basic block to
2247 // have a terminator, which was already removed by EmitOMPRegionBody.
2248 // IP is currently at the cancellation block.
2249 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2250 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2251 CancellationBranches.push_back(DummyBranch);
2252 return FiniCB(IP);
2253 };
2254
2255 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2256
2257 // Each section is emitted as a switch case
2258 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2259 // -> OMP.createSection() which generates the IR for each section
2260 // Iterate through all sections and emit a switch construct:
2261 // switch (IV) {
2262 // case 0:
2263 // <SectionStmt[0]>;
2264 // break;
2265 // ...
2266 // case <NumSection> - 1:
2267 // <SectionStmt[<NumSection> - 1]>;
2268 // break;
2269 // }
2270 // ...
2271 // section_loop.after:
2272 // <FiniCB>;
2273 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2274 Builder.restoreIP(CodeGenIP);
2275 BasicBlock *Continue =
2276 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2277 Function *CurFn = Continue->getParent();
2278 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2279
2280 unsigned CaseNumber = 0;
2281 for (auto SectionCB : SectionCBs) {
2282 BasicBlock *CaseBB = BasicBlock::Create(
2283 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2284 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2285 Builder.SetInsertPoint(CaseBB);
2286 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2287 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2288 CaseEndBr->getIterator()}))
2289 return Err;
2290 CaseNumber++;
2291 }
2292 // Remove the existing terminator from the body BB since there can be no
2293 // terminators after a switch/case.
2294 return Error::success();
2295 };
2296 // Loop body ends here
2297 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2298 Type *I32Ty = Type::getInt32Ty(M.getContext());
2299 Value *LB = ConstantInt::get(I32Ty, 0);
2300 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2301 Value *ST = ConstantInt::get(I32Ty, 1);
2302 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2303 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2304 if (!LoopInfo)
2305 return LoopInfo.takeError();
2306
2307 InsertPointOrErrorTy WsloopIP =
2308 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2309 WorksharingLoopType::ForStaticLoop, !IsNowait);
2310 if (!WsloopIP)
2311 return WsloopIP.takeError();
2312 InsertPointTy AfterIP = *WsloopIP;
2313
2314 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2315 assert(LoopFini && "Bad structure of static workshare loop finalization");
2316
2317 // Apply the finalization callback in LoopAfterBB
2318 auto FiniInfo = FinalizationStack.pop_back_val();
2319 assert(FiniInfo.DK == OMPD_sections &&
2320 "Unexpected finalization stack state!");
2321 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2322 Builder.restoreIP(AfterIP);
2323 BasicBlock *FiniBB =
2324 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2325 if (Error Err = CB(Builder.saveIP()))
2326 return Err;
2327 AfterIP = {FiniBB, FiniBB->begin()};
2328 }
2329
2330 // Now we can fix the dummy branch to point to the right place
2331 for (BranchInst *DummyBranch : CancellationBranches) {
2332 assert(DummyBranch->getNumSuccessors() == 1);
2333 DummyBranch->setSuccessor(0, LoopFini);
2334 }
2335
2336 return AfterIP;
2337}
2338
2339OpenMPIRBuilder::InsertPointOrErrorTy
2340OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2341 BodyGenCallbackTy BodyGenCB,
2342 FinalizeCallbackTy FiniCB) {
2343 if (!updateToLocation(Loc))
2344 return Loc.IP;
2345
2346 auto FiniCBWrapper = [&](InsertPointTy IP) {
2347 if (IP.getBlock()->end() != IP.getPoint())
2348 return FiniCB(IP);
2349 // This must be done; otherwise any nested constructs using FinalizeOMPRegion
2350 // will fail because that function requires the finalization basic block to
2351 // have a terminator, which was already removed by EmitOMPRegionBody.
2352 // IP is currently at the cancellation block.
2353 // We need to backtrack to the condition block to fetch
2354 // the exit block and create a branch from the cancellation
2355 // block to the exit block.
2356 IRBuilder<>::InsertPointGuard IPG(Builder);
2357 Builder.restoreIP(IP);
2358 auto *CaseBB = Loc.IP.getBlock();
2359 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2360 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2361 Instruction *I = Builder.CreateBr(ExitBB);
2362 IP = InsertPointTy(I->getParent(), I->getIterator());
2363 return FiniCB(IP);
2364 };
2365
2366 Directive OMPD = Directive::OMPD_sections;
2367 // Since we are using Finalization Callback here, HasFinalize
2368 // and IsCancellable have to be true
2369 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2370 /*Conditional*/ false, /*hasFinalize*/ true,
2371 /*IsCancellable*/ true);
2372}
2373
2374 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2375 BasicBlock::iterator IT(I);
2376 IT++;
2377 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2378}
2379
2380Value *OpenMPIRBuilder::getGPUThreadID() {
2381 return Builder.CreateCall(
2382 getOrCreateRuntimeFunction(M,
2383 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2384 {});
2385}
2386
2387Value *OpenMPIRBuilder::getGPUWarpSize() {
2388 return Builder.CreateCall(
2389 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2390}
2391
2392Value *OpenMPIRBuilder::getNVPTXWarpID() {
2393 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2394 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2395}
2396
2397Value *OpenMPIRBuilder::getNVPTXLaneID() {
2398 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2399 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2400 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2401 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2402 "nvptx_lane_id");
2403}
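// Worked example, assuming the common warp size of 32: LaneIDBits = 5 and
// LaneIDMask = 0x1f, so getNVPTXLaneID computes tid & 31 while getNVPTXWarpID
// above computes tid >> 5.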
2404
2405Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2406 Type *ToType) {
2407 Type *FromType = From->getType();
2408 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2409 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2410 assert(FromSize > 0 && "From size must be greater than zero");
2411 assert(ToSize > 0 && "To size must be greater than zero");
2412 if (FromType == ToType)
2413 return From;
2414 if (FromSize == ToSize)
2415 return Builder.CreateBitCast(From, ToType);
2416 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2417 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2418 InsertPointTy SaveIP = Builder.saveIP();
2419 Builder.restoreIP(AllocaIP);
2420 Value *CastItem = Builder.CreateAlloca(ToType);
2421 Builder.restoreIP(SaveIP);
2422
2423 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2424 CastItem, Builder.getPtrTy(0));
2425 Builder.CreateStore(From, ValCastItem);
2426 return Builder.CreateLoad(ToType, CastItem);
2427}
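// In summary: identical types are returned as-is, equally sized types are
// bitcast, integer/integer mismatches use a signed int cast, and everything
// else takes a round trip through a stack slot of the destination type.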
2428
2429Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2430 Value *Element,
2431 Type *ElementType,
2432 Value *Offset) {
2433 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2434 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2435
2436 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2437 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2438 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2439 Value *WarpSize =
2440 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2441 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2442 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2443 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2444 Value *WarpSizeCast =
2445 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2446 Value *ShuffleCall =
2447 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2448 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2449}
2450
2451void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2452 Value *DstAddr, Type *ElemType,
2453 Value *Offset, Type *ReductionArrayTy) {
2454 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2455 // Create the loop over the big-sized data.
2456 // ptr = (void*)Elem;
2457 // ptrEnd = (void*) Elem + 1;
2458 // Step = 8;
2459 // while (ptr + Step < ptrEnd)
2460 // shuffle((int64_t)*ptr);
2461 // Step = 4;
2462 // while (ptr + Step < ptrEnd)
2463 // shuffle((int32_t)*ptr);
2464 // ...
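// E.g. for a hypothetical 11-byte element: one 64-bit shuffle moves bytes
// 0-7, one 16-bit shuffle moves bytes 8-9, and one 8-bit shuffle moves byte
// 10 (Size shrinks 11 -> 3 -> 1 -> 0 via Size % IntSize).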
2465 Type *IndexTy = Builder.getIndexTy(
2466 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2467 Value *ElemPtr = DstAddr;
2468 Value *Ptr = SrcAddr;
2469 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2470 if (Size < IntSize)
2471 continue;
2472 Type *IntType = Builder.getIntNTy(IntSize * 8);
2473 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2474 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2475 Value *SrcAddrGEP =
2476 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2477 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2478 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2479
2480 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2481 if ((Size / IntSize) > 1) {
2482 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2483 SrcAddrGEP, Builder.getPtrTy());
2484 BasicBlock *PreCondBB =
2485 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2486 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2487 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2488 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2489 emitBlock(PreCondBB, CurFunc);
2490 PHINode *PhiSrc =
2491 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2492 PhiSrc->addIncoming(Ptr, CurrentBB);
2493 PHINode *PhiDest =
2494 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2495 PhiDest->addIncoming(ElemPtr, CurrentBB);
2496 Ptr = PhiSrc;
2497 ElemPtr = PhiDest;
2498 Value *PtrDiff = Builder.CreatePtrDiff(
2499 Builder.getInt8Ty(), PtrEnd,
2500 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2501 Builder.CreateCondBr(
2502 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2503 ExitBB);
2504 emitBlock(ThenBB, CurFunc);
2505 Value *Res = createRuntimeShuffleFunction(
2506 AllocaIP,
2507 Builder.CreateAlignedLoad(
2508 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2509 IntType, Offset);
2510 Builder.CreateAlignedStore(Res, ElemPtr,
2511 M.getDataLayout().getPrefTypeAlign(ElemType));
2512 Value *LocalPtr =
2513 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2514 Value *LocalElemPtr =
2515 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2516 PhiSrc->addIncoming(LocalPtr, ThenBB);
2517 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2518 emitBranch(PreCondBB);
2519 emitBlock(ExitBB, CurFunc);
2520 } else {
2521 Value *Res = createRuntimeShuffleFunction(
2522 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2523 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2524 Res->getType()->getScalarSizeInBits())
2525 Res = Builder.CreateTrunc(Res, ElemType);
2526 Builder.CreateStore(Res, ElemPtr);
2527 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2528 ElemPtr =
2529 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2530 }
2531 Size = Size % IntSize;
2532 }
2533}
2534
2535void OpenMPIRBuilder::emitReductionListCopy(
2536 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2537 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2538 CopyOptionsTy CopyOptions) {
2539 Type *IndexTy = Builder.getIndexTy(
2540 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2541 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2542
2543 // Iterate, element by element, through the source Reduce list and
2544 // make a copy.
2545 for (auto En : enumerate(ReductionInfos)) {
2546 const ReductionInfo &RI = En.value();
2547 Value *SrcElementAddr = nullptr;
2548 Value *DestElementAddr = nullptr;
2549 Value *DestElementPtrAddr = nullptr;
2550 // Should we shuffle in an element from a remote lane?
2551 bool ShuffleInElement = false;
2552 // Set to true to update the pointer in the dest Reduce list to a
2553 // newly created element.
2554 bool UpdateDestListPtr = false;
2555
2556 // Step 1.1: Get the address for the src element in the Reduce list.
2557 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2558 ReductionArrayTy, SrcBase,
2559 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2560 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2561
2562 // Step 1.2: Create a temporary to store the element in the destination
2563 // Reduce list.
2564 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2565 ReductionArrayTy, DestBase,
2566 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2567 switch (Action) {
2568 case CopyAction::RemoteLaneToThread: {
2569 InsertPointTy CurIP = Builder.saveIP();
2570 Builder.restoreIP(AllocaIP);
2571 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2572 ".omp.reduction.element");
2573 DestAlloca->setAlignment(
2574 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2575 DestElementAddr = DestAlloca;
2576 DestElementAddr =
2577 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2578 DestElementAddr->getName() + ".ascast");
2579 Builder.restoreIP(CurIP);
2580 ShuffleInElement = true;
2581 UpdateDestListPtr = true;
2582 break;
2583 }
2584 case CopyAction::ThreadCopy: {
2585 DestElementAddr =
2586 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2587 break;
2588 }
2589 }
2590
2591 // Now that all active lanes have read the element in the
2592 // Reduce list, shuffle over the value from the remote lane.
2593 if (ShuffleInElement) {
2594 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2595 RemoteLaneOffset, ReductionArrayTy);
2596 } else {
2597 switch (RI.EvaluationKind) {
2598 case EvalKind::Scalar: {
2599 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2600 // Store the source element value to the dest element address.
2601 Builder.CreateStore(Elem, DestElementAddr);
2602 break;
2603 }
2604 case EvalKind::Complex: {
2605 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2606 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2607 Value *SrcReal = Builder.CreateLoad(
2608 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2609 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2610 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2611 Value *SrcImg = Builder.CreateLoad(
2612 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2613
2614 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2615 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2616 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2617 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2618 Builder.CreateStore(SrcReal, DestRealPtr);
2619 Builder.CreateStore(SrcImg, DestImgPtr);
2620 break;
2621 }
2622 case EvalKind::Aggregate: {
2623 Value *SizeVal = Builder.getInt64(
2624 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2625 Builder.CreateMemCpy(
2626 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2627 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2628 SizeVal, false);
2629 break;
2630 }
2631 };
2632 }
2633
2634 // Step 3.1: Modify reference in dest Reduce list as needed.
2635 // Modifying the reference in Reduce list to point to the newly
2636 // created element. The element is live in the current function
2637 // scope and that of functions it invokes (i.e., reduce_function).
2638 // RemoteReduceData[i] = (void*)&RemoteElem
2639 if (UpdateDestListPtr) {
2640 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2641 DestElementAddr, Builder.getPtrTy(),
2642 DestElementAddr->getName() + ".ascast");
2643 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2644 }
2645 }
2646}
2647
2648Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2649 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2650 AttributeList FuncAttrs) {
2651 InsertPointTy SavedIP = Builder.saveIP();
2652 LLVMContext &Ctx = M.getContext();
2653 FunctionType *FuncTy = FunctionType::get(
2654 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2655 /* IsVarArg */ false);
2656 Function *WcFunc =
2657 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2658 "_omp_reduction_inter_warp_copy_func", &M);
2659 WcFunc->setAttributes(FuncAttrs);
2660 WcFunc->addParamAttr(0, Attribute::NoUndef);
2661 WcFunc->addParamAttr(1, Attribute::NoUndef);
2662 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2663 Builder.SetInsertPoint(EntryBB);
2664
2665 // ReduceList: thread local Reduce list.
2666 // At the stage of the computation when this function is called, partially
2667 // aggregated values reside in the first lane of every active warp.
2668 Argument *ReduceListArg = WcFunc->getArg(0);
2669 // NumWarps: number of warps active in the parallel region. This could
2670 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2671 Argument *NumWarpsArg = WcFunc->getArg(1);
2672
2673 // This array is used as a medium to transfer, one reduce element at a time,
2674 // the data from the first lane of every warp to lanes in the first warp
2675 // in order to perform the final step of a reduction in a parallel region
2676 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2677 // for reduced latency, as well as to have a distinct copy for concurrently
2678 // executing target regions. The array is declared with weak linkage so
2679 // as to be shared across compilation units.
2680 StringRef TransferMediumName =
2681 "__openmp_nvptx_data_transfer_temporary_storage";
2682 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2683 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2684 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2685 if (!TransferMedium) {
2686 TransferMedium = new GlobalVariable(
2687 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2688 UndefValue::get(ArrayTy), TransferMediumName,
2689 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2690 /*AddressSpace=*/3);
2691 }
2692
2693 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694 Value *GPUThreadID = getGPUThreadID();
2695 // nvptx_lane_id = nvptx_id % warpsize
2696 Value *LaneID = getNVPTXLaneID();
2697 // nvptx_warp_id = nvptx_id / warpsize
2698 Value *WarpID = getNVPTXWarpID();
2699
2700 InsertPointTy AllocaIP =
2701 InsertPointTy(Builder.GetInsertBlock(),
2702 Builder.GetInsertBlock()->getFirstInsertionPt());
2703 Type *Arg0Type = ReduceListArg->getType();
2704 Type *Arg1Type = NumWarpsArg->getType();
2705 Builder.restoreIP(AllocaIP);
2706 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2707 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2708 AllocaInst *NumWarpsAlloca =
2709 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2710 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2711 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2712 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2713 NumWarpsAlloca, Builder.getPtrTy(0),
2714 NumWarpsAlloca->getName() + ".ascast");
2715 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2716 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2717 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2718 InsertPointTy CodeGenIP =
2719 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2720 Builder.restoreIP(CodeGenIP);
2721
2722 Value *ReduceList =
2723 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2724
2725 for (auto En : enumerate(ReductionInfos)) {
2726 //
2727 // Warp master copies reduce element to transfer medium in __shared__
2728 // memory.
2729 //
2730 const ReductionInfo &RI = En.value();
2731 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2732 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2733 Type *CType = Builder.getIntNTy(TySize * 8);
2734
2735 unsigned NumIters = RealTySize / TySize;
2736 if (NumIters == 0)
2737 continue;
2738 Value *Cnt = nullptr;
2739 Value *CntAddr = nullptr;
2740 BasicBlock *PrecondBB = nullptr;
2741 BasicBlock *ExitBB = nullptr;
2742 if (NumIters > 1) {
2743 CodeGenIP = Builder.saveIP();
2744 Builder.restoreIP(AllocaIP);
2745 CntAddr =
2746 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2747
2748 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2749 CntAddr->getName() + ".ascast");
2750 Builder.restoreIP(CodeGenIP);
2751 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2752 CntAddr,
2753 /*Volatile=*/false);
2754 PrecondBB = BasicBlock::Create(Ctx, "precond");
2755 ExitBB = BasicBlock::Create(Ctx, "exit");
2756 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2757 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2758 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2759 /*Volatile=*/false);
2760 Value *Cmp = Builder.CreateICmpULT(
2761 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2762 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2763 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2764 }
2765
2766 // kmpc_barrier.
2767 InsertPointOrErrorTy BarrierIP1 =
2768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2769 omp::Directive::OMPD_unknown,
2770 /* ForceSimpleCall */ false,
2771 /* CheckCancelFlag */ true);
2772 if (!BarrierIP1)
2773 return BarrierIP1.takeError();
2774 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2775 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2776 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2777
2778 // if (lane_id == 0)
2779 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2780 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2781 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2782
2783 // Reduce element = LocalReduceList[i]
2784 auto *RedListArrayTy =
2785 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2786 Type *IndexTy = Builder.getIndexTy(
2787 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2788 Value *ElemPtrPtr =
2789 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2790 {ConstantInt::get(IndexTy, 0),
2791 ConstantInt::get(IndexTy, En.index())});
2792 // elemptr = ((CopyType*)(elemptrptr)) + I
2793 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2794 if (NumIters > 1)
2795 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2796
2797 // Get pointer to location in transfer medium.
2798 // MediumPtr = &medium[warp_id]
2799 Value *MediumPtr = Builder.CreateInBoundsGEP(
2800 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2801 // elem = *elemptr
2802 // *MediumPtr = elem
2803 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2804 // Store the source element value to the dest element address.
2805 Builder.CreateStore(Elem, MediumPtr,
2806 /*IsVolatile*/ true);
2807 Builder.CreateBr(MergeBB);
2808
2809 // else
2810 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2811 Builder.CreateBr(MergeBB);
2812
2813 // endif
2814 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2815 InsertPointOrErrorTy BarrierIP2 =
2816 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2817 omp::Directive::OMPD_unknown,
2818 /* ForceSimpleCall */ false,
2819 /* CheckCancelFlag */ true);
2820 if (!BarrierIP2)
2821 return BarrierIP2.takeError();
2822
2823 // Warp 0 copies reduce element from transfer medium
2824 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2825 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2826 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2827
2828 Value *NumWarpsVal =
2829 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2830 // Up to 32 threads in warp 0 are active.
2831 Value *IsActiveThread =
2832 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2833 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2834
2835 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2836
2837 // SrcMediumPtr = &medium[tid]
2838 // SrcMediumVal = *SrcMediumPtr
2839 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2840 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2841 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2842 Value *TargetElemPtrPtr =
2843 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2844 {ConstantInt::get(IndexTy, 0),
2845 ConstantInt::get(IndexTy, En.index())});
2846 Value *TargetElemPtrVal =
2847 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2848 Value *TargetElemPtr = TargetElemPtrVal;
2849 if (NumIters > 1)
2850 TargetElemPtr =
2851 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2852
2853 // *TargetElemPtr = SrcMediumVal;
2854 Value *SrcMediumValue =
2855 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2856 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2857 Builder.CreateBr(W0MergeBB);
2858
2859 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2860 Builder.CreateBr(W0MergeBB);
2861
2862 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2863
2864 if (NumIters > 1) {
2865 Cnt = Builder.CreateNSWAdd(
2866 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2867 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2868
2869 auto *CurFn = Builder.GetInsertBlock()->getParent();
2870 emitBranch(PrecondBB);
2871 emitBlock(ExitBB, CurFn);
2872 }
2873 RealTySize %= TySize;
2874 }
2875 }
2876
2877 Builder.CreateRetVoid();
2878 Builder.restoreIP(SavedIP);
2879
2880 return WcFunc;
2881}
2882
2883Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2885 AttributeList FuncAttrs) {
2886 LLVMContext &Ctx = M.getContext();
2887 FunctionType *FuncTy =
2888 FunctionType::get(Builder.getVoidTy(),
2889 {Builder.getPtrTy(), Builder.getInt16Ty(),
2890 Builder.getInt16Ty(), Builder.getInt16Ty()},
2891 /* IsVarArg */ false);
2892 Function *SarFunc =
2893 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2894 "_omp_reduction_shuffle_and_reduce_func", &M);
2895 SarFunc->setAttributes(FuncAttrs);
2896 SarFunc->addParamAttr(0, Attribute::NoUndef);
2897 SarFunc->addParamAttr(1, Attribute::NoUndef);
2898 SarFunc->addParamAttr(2, Attribute::NoUndef);
2899 SarFunc->addParamAttr(3, Attribute::NoUndef);
2900 SarFunc->addParamAttr(1, Attribute::SExt);
2901 SarFunc->addParamAttr(2, Attribute::SExt);
2902 SarFunc->addParamAttr(3, Attribute::SExt);
2903 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2904 Builder.SetInsertPoint(EntryBB);
2905
2906 // Thread local Reduce list used to host the values of data to be reduced.
2907 Argument *ReduceListArg = SarFunc->getArg(0);
2908 // Current lane id; could be logical.
2909 Argument *LaneIDArg = SarFunc->getArg(1);
2910 // Offset of the remote source lane relative to the current lane.
2911 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2912 // Algorithm version. This is expected to be known at compile time.
2913 Argument *AlgoVerArg = SarFunc->getArg(3);
2914
2915 Type *ReduceListArgType = ReduceListArg->getType();
2916 Type *LaneIDArgType = LaneIDArg->getType();
2917 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2918 Value *ReduceListAlloca = Builder.CreateAlloca(
2919 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2920 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2921 LaneIDArg->getName() + ".addr");
2922 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2923 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2924 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2925 AlgoVerArg->getName() + ".addr");
2926 ArrayType *RedListArrayTy =
2927 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2928
2929 // Create a local thread-private variable to host the Reduce list
2930 // from a remote lane.
2931 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2932 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2933
2934 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2935 ReduceListAlloca, ReduceListArgType,
2936 ReduceListAlloca->getName() + ".ascast");
2937 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2938 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2939 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2940 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2941 RemoteLaneOffsetAlloca->getName() + ".ascast");
2942 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2943 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2944 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2945 RemoteReductionListAlloca, Builder.getPtrTy(),
2946 RemoteReductionListAlloca->getName() + ".ascast");
2947
2948 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2949 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2950 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2951 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2952
2953 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2954 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2955 Value *RemoteLaneOffset =
2956 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2957 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2958
2959 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2960
2961 // This loop iterates through the list of reduce elements and copies,
2962 // element by element, from a remote lane in the warp to RemoteReduceList,
2963 // hosted on the thread's stack.
2964 emitReductionListCopy(
2965 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2966 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2967
2968 // The actions to be performed on the remote Reduce list depend
2969 // on the algorithm version.
2970 //
2971 // if ((AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2972 // LaneId % 2 == 0 && Offset > 0)):
2973 // do the reduction value aggregation
2974 //
2975 // The thread local variable Reduce list is mutated in place to host the
2976 // reduced data, which is the aggregated value produced from local and
2977 // remote lanes.
2978 //
2979 // Note that AlgoVer is expected to be a constant integer known at compile
2980 // time.
2981 // When AlgoVer==0, the first conjunction evaluates to true, making
2982 // the entire predicate true at compile time.
2983 // When AlgoVer==1, only the second part of the second conjunction needs
2984 // to be evaluated at runtime. The other conjunctions evaluate to false
2985 // at compile time.
2986 // When AlgoVer==2, only the second part of the third conjunction needs
2987 // to be evaluated at runtime. The other conjunctions evaluate to false
2988 // at compile time.
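// As a sketch, the guard computed below folds to the following C
// expression (variable names are illustrative):
//
//   bool do_reduce = (algo_ver == 0) ||
//                    (algo_ver == 1 && lane_id < remote_lane_offset) ||
//                    (algo_ver == 2 && (lane_id & 1) == 0 &&
//                     remote_lane_offset > 0);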
2989 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2990 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2991 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2992 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2993 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2994 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2995 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2996 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2997 Value *RemoteOffsetComp =
2998 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2999 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3000 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3001 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3002
3003 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3004 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3005 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3006
3007 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3008 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3009 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3010 ReduceList, Builder.getPtrTy());
3011 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3012 RemoteListAddrCast, Builder.getPtrTy());
3013 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3014 ->addFnAttr(Attribute::NoUnwind);
3015 Builder.CreateBr(MergeBB);
3016
3017 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3018 Builder.CreateBr(MergeBB);
3019
3020 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3021
3022 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3023 // Reduce list.
3024 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3025 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3026 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3027
3028 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3029 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3030 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3031 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3032
3033 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3034 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3035 ReductionInfos, RemoteListAddrCast, ReduceList);
3036 Builder.CreateBr(CpyMergeBB);
3037
3038 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3039 Builder.CreateBr(CpyMergeBB);
3040
3041 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3042
3043 Builder.CreateRetVoid();
3044
3045 return SarFunc;
3046}
3047
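// A rough sketch of the copy helper emitted below (names illustrative;
// Buffer points to an array of struct._globalized_locals_ty records):
//
//   void list_to_global_copy(void *buffer, int idx, void **reduce_list) {
//     for (int i = 0; i < N; ++i) // N = number of reduction elements
//       buffer[idx].elem[i] = *(T_i *)reduce_list[i];
//   }
//
// Scalars and complex values are copied field by field; aggregates are
// copied with memcpy.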
3048Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3049 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3050 AttributeList FuncAttrs) {
3051 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3052 LLVMContext &Ctx = M.getContext();
3053 auto *FuncTy = FunctionType::get(
3054 Builder.getVoidTy(),
3055 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3056 /* IsVarArg */ false);
3057 Function *LtGCFunc =
3058 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3059 "_omp_reduction_list_to_global_copy_func", &M);
3060 LtGCFunc->setAttributes(FuncAttrs);
3061 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3062 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3063 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3064
3065 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3066 Builder.SetInsertPoint(EntryBlock);
3067
3068 // Buffer: global reduction buffer.
3069 Argument *BufferArg = LtGCFunc->getArg(0);
3070 // Idx: index of the buffer.
3071 Argument *IdxArg = LtGCFunc->getArg(1);
3072 // ReduceList: thread local Reduce list.
3073 Argument *ReduceListArg = LtGCFunc->getArg(2);
3074
3075 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3076 BufferArg->getName() + ".addr");
3077 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3078 IdxArg->getName() + ".addr");
3079 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3080 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3081 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3082 BufferArgAlloca, Builder.getPtrTy(),
3083 BufferArgAlloca->getName() + ".ascast");
3084 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3085 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3086 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3087 ReduceListArgAlloca, Builder.getPtrTy(),
3088 ReduceListArgAlloca->getName() + ".ascast");
3089
3090 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3091 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3092 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3093
3094 Value *LocalReduceList =
3095 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3096 Value *BufferArgVal =
3097 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3098 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3099 Type *IndexTy = Builder.getIndexTy(
3100 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3101 for (auto En : enumerate(ReductionInfos)) {
3102 const ReductionInfo &RI = En.value();
3103 auto *RedListArrayTy =
3104 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3105 // Reduce element = LocalReduceList[i]
3106 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3107 RedListArrayTy, LocalReduceList,
3108 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3109 // elemptr = ((CopyType*)(elemptrptr)) + I
3110 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3111
3112 // Global = Buffer.VD[Idx];
3113 Value *BufferVD =
3114 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3115 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3116 ReductionsBufferTy, BufferVD, 0, En.index());
3117
3118 switch (RI.EvaluationKind) {
3119 case EvalKind::Scalar: {
3120 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3121 Builder.CreateStore(TargetElement, GlobVal);
3122 break;
3123 }
3124 case EvalKind::Complex: {
3125 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3126 RI.ElementType, ElemPtr, 0, 0, ".realp");
3127 Value *SrcReal = Builder.CreateLoad(
3128 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3129 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3130 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3131 Value *SrcImg = Builder.CreateLoad(
3132 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3133
3134 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3135 RI.ElementType, GlobVal, 0, 0, ".realp");
3136 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3137 RI.ElementType, GlobVal, 0, 1, ".imagp");
3138 Builder.CreateStore(SrcReal, DestRealPtr);
3139 Builder.CreateStore(SrcImg, DestImgPtr);
3140 break;
3141 }
3142 case EvalKind::Aggregate: {
3143 Value *SizeVal =
3144 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3145 Builder.CreateMemCpy(
3146 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3147 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3148 break;
3149 }
3150 }
3151 }
3152
3153 Builder.CreateRetVoid();
3154 Builder.restoreIP(OldIP);
3155 return LtGCFunc;
3156}
3157
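// Roughly, the helper emitted below builds a Reduce list of pointers into
// the global buffer slot and hands it to the reduction function (C-like
// sketch, names illustrative):
//
//   void list_to_global_reduce(void *buffer, int idx, void **reduce_list) {
//     void *global_list[N] = {&buffer[idx].elem[0], ...,
//                             &buffer[idx].elem[N-1]};
//     reduce_fn(global_list, reduce_list);
//   }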
3158Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3159 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3160 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3161 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3162 LLVMContext &Ctx = M.getContext();
3163 auto *FuncTy = FunctionType::get(
3164 Builder.getVoidTy(),
3165 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3166 /* IsVarArg */ false);
3167 Function *LtGRFunc =
3168 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3169 "_omp_reduction_list_to_global_reduce_func", &M);
3170 LtGRFunc->setAttributes(FuncAttrs);
3171 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3172 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3173 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3174
3175 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3176 Builder.SetInsertPoint(EntryBlock);
3177
3178 // Buffer: global reduction buffer.
3179 Argument *BufferArg = LtGRFunc->getArg(0);
3180 // Idx: index of the buffer.
3181 Argument *IdxArg = LtGRFunc->getArg(1);
3182 // ReduceList: thread local Reduce list.
3183 Argument *ReduceListArg = LtGRFunc->getArg(2);
3184
3185 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3186 BufferArg->getName() + ".addr");
3187 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3188 IdxArg->getName() + ".addr");
3189 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3190 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3191 auto *RedListArrayTy =
3192 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3193
3194 // 1. Build a list of reduction variables.
3195 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3196 Value *LocalReduceList =
3197 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3198
3199 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3200 BufferArgAlloca, Builder.getPtrTy(),
3201 BufferArgAlloca->getName() + ".ascast");
3202 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3203 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3204 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 ReduceListArgAlloca, Builder.getPtrTy(),
3206 ReduceListArgAlloca->getName() + ".ascast");
3207 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3208 LocalReduceList, Builder.getPtrTy(),
3209 LocalReduceList->getName() + ".ascast");
3210
3211 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3212 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3213 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3214
3215 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3216 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3217 Type *IndexTy = Builder.getIndexTy(
3218 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3219 for (auto En : enumerate(ReductionInfos)) {
3220 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3221 RedListArrayTy, LocalReduceListAddrCast,
3222 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3223 Value *BufferVD =
3224 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3225 // Global = Buffer.VD[Idx];
3226 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3227 ReductionsBufferTy, BufferVD, 0, En.index());
3228 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3229 }
3230
3231 // Call reduce_function(GlobalReduceList, ReduceList)
3232 Value *ReduceList =
3233 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3234 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3235 ->addFnAttr(Attribute::NoUnwind);
3236 Builder.CreateRetVoid();
3237 Builder.restoreIP(OldIP);
3238 return LtGRFunc;
3239}
3240
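// The next two helpers are the mirror direction of the list-to-global
// pair: they copy from, or reduce with, a global buffer slot back into the
// thread-local Reduce list. Sketch for the copy (names illustrative):
//
//   void global_to_list_copy(void *buffer, int idx, void **reduce_list) {
//     for (int i = 0; i < N; ++i)
//       *(T_i *)reduce_list[i] = buffer[idx].elem[i];
//   }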
3241Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3242 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3243 AttributeList FuncAttrs) {
3244 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3245 LLVMContext &Ctx = M.getContext();
3246 auto *FuncTy = FunctionType::get(
3247 Builder.getVoidTy(),
3248 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3249 /* IsVarArg */ false);
3250 Function *LtGCFunc =
3251 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3252 "_omp_reduction_global_to_list_copy_func", &M);
3253 LtGCFunc->setAttributes(FuncAttrs);
3254 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3255 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3256 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3257
3258 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3259 Builder.SetInsertPoint(EntryBlock);
3260
3261 // Buffer: global reduction buffer.
3262 Argument *BufferArg = LtGCFunc->getArg(0);
3263 // Idx: index of the buffer.
3264 Argument *IdxArg = LtGCFunc->getArg(1);
3265 // ReduceList: thread local Reduce list.
3266 Argument *ReduceListArg = LtGCFunc->getArg(2);
3267
3268 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3269 BufferArg->getName() + ".addr");
3270 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3271 IdxArg->getName() + ".addr");
3272 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3273 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3274 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3275 BufferArgAlloca, Builder.getPtrTy(),
3276 BufferArgAlloca->getName() + ".ascast");
3277 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3278 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3279 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3280 ReduceListArgAlloca, Builder.getPtrTy(),
3281 ReduceListArgAlloca->getName() + ".ascast");
3282 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3283 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3284 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3285
3286 Value *LocalReduceList =
3287 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3288 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3289 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3290 Type *IndexTy = Builder.getIndexTy(
3291 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3292 for (auto En : enumerate(ReductionInfos)) {
3293 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3294 auto *RedListArrayTy =
3295 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3296 // Reduce element = LocalReduceList[i]
3297 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3298 RedListArrayTy, LocalReduceList,
3299 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3300 // elemptr = ((CopyType*)(elemptrptr)) + I
3301 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3302 // Global = Buffer.VD[Idx];
3303 Value *BufferVD =
3304 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3305 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3306 ReductionsBufferTy, BufferVD, 0, En.index());
3307
3308 switch (RI.EvaluationKind) {
3309 case EvalKind::Scalar: {
3310 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3311 Builder.CreateStore(TargetElement, ElemPtr);
3312 break;
3313 }
3314 case EvalKind::Complex: {
3315 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3316 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3317 Value *SrcReal = Builder.CreateLoad(
3318 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3319 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3320 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3321 Value *SrcImg = Builder.CreateLoad(
3322 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3323
3324 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3325 RI.ElementType, ElemPtr, 0, 0, ".realp");
3326 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3327 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3328 Builder.CreateStore(SrcReal, DestRealPtr);
3329 Builder.CreateStore(SrcImg, DestImgPtr);
3330 break;
3331 }
3332 case EvalKind::Aggregate: {
3333 Value *SizeVal =
3334 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3335 Builder.CreateMemCpy(
3336 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3337 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3338 SizeVal, false);
3339 break;
3340 }
3341 }
3342 }
3343
3344 Builder.CreateRetVoid();
3345 Builder.restoreIP(OldIP);
3346 return LtGCFunc;
3347}
3348
3349Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3350 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3351 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3352 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3353 LLVMContext &Ctx = M.getContext();
3354 auto *FuncTy = FunctionType::get(
3355 Builder.getVoidTy(),
3356 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3357 /* IsVarArg */ false);
3358 Function *LtGRFunc =
3359 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3360 "_omp_reduction_global_to_list_reduce_func", &M);
3361 LtGRFunc->setAttributes(FuncAttrs);
3362 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3363 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3364 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3365
3366 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3367 Builder.SetInsertPoint(EntryBlock);
3368
3369 // Buffer: global reduction buffer.
3370 Argument *BufferArg = LtGRFunc->getArg(0);
3371 // Idx: index of the buffer.
3372 Argument *IdxArg = LtGRFunc->getArg(1);
3373 // ReduceList: thread local Reduce list.
3374 Argument *ReduceListArg = LtGRFunc->getArg(2);
3375
3376 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3377 BufferArg->getName() + ".addr");
3378 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3379 IdxArg->getName() + ".addr");
3380 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3381 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3382 ArrayType *RedListArrayTy =
3383 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3384
3385 // 1. Build a list of reduction variables.
3386 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3387 Value *LocalReduceList =
3388 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3389
3390 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3391 BufferArgAlloca, Builder.getPtrTy(),
3392 BufferArgAlloca->getName() + ".ascast");
3393 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3394 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3395 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3396 ReduceListArgAlloca, Builder.getPtrTy(),
3397 ReduceListArgAlloca->getName() + ".ascast");
3398 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3399 LocalReduceList, Builder.getPtrTy(),
3400 LocalReduceList->getName() + ".ascast");
3401
3402 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3403 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3404 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3405
3406 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3407 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3408 Type *IndexTy = Builder.getIndexTy(
3409 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3410 for (auto En : enumerate(ReductionInfos)) {
3411 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3412 RedListArrayTy, ReductionList,
3413 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3414 // Global = Buffer.VD[Idx];
3415 Value *BufferVD =
3416 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3417 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3418 ReductionsBufferTy, BufferVD, 0, En.index());
3419 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3420 }
3421
3422 // Call reduce_function(ReduceList, GlobalReduceList)
3423 Value *ReduceList =
3424 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3425 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3426 ->addFnAttr(Attribute::NoUnwind);
3427 Builder.CreateRetVoid();
3428 Builder.restoreIP(OldIP);
3429 return LtGRFunc;
3430}
3431
3432std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3433 std::string Suffix =
3434 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3435 return (Name + Suffix).str();
3436}
3437
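// For the non-Clang callback path, the reduction function created below
// has roughly this shape (C-like sketch; the element types T_i and the
// reduce_op_i combiners come from the ReductionInfos):
//
//   void reduction_func(void **lhs_list, void **rhs_list) {
//     for (int i = 0; i < N; ++i) // emitted unrolled, one body per info
//       *(T_i *)lhs_list[i] =
//           reduce_op_i(*(T_i *)lhs_list[i], *(T_i *)rhs_list[i]);
//   }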
3438Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3439 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3440 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3441 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3442 {Builder.getPtrTy(), Builder.getPtrTy()},
3443 /* IsVarArg */ false);
3444 std::string Name = getReductionFuncName(ReducerName);
3445 Function *ReductionFunc =
3446 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3447 ReductionFunc->setAttributes(FuncAttrs);
3448 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3449 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3450 BasicBlock *EntryBB =
3451 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3452 Builder.SetInsertPoint(EntryBB);
3453
3454 // Need to alloca memory here and deal with the pointers before getting
3455 // LHS/RHS pointers out
3456 Value *LHSArrayPtr = nullptr;
3457 Value *RHSArrayPtr = nullptr;
3458 Argument *Arg0 = ReductionFunc->getArg(0);
3459 Argument *Arg1 = ReductionFunc->getArg(1);
3460 Type *Arg0Type = Arg0->getType();
3461 Type *Arg1Type = Arg1->getType();
3462
3463 Value *LHSAlloca =
3464 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3465 Value *RHSAlloca =
3466 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3467 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3468 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3469 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3470 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3471 Builder.CreateStore(Arg0, LHSAddrCast);
3472 Builder.CreateStore(Arg1, RHSAddrCast);
3473 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3474 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3475
3476 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3477 Type *IndexTy = Builder.getIndexTy(
3478 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3479 SmallVector<Value *> LHSPtrs, RHSPtrs;
3480 for (auto En : enumerate(ReductionInfos)) {
3481 const ReductionInfo &RI = En.value();
3482 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3483 RedArrayTy, RHSArrayPtr,
3484 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3485 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3486 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RHSI8Ptr, RI.PrivateVariable->getType(),
3488 RHSI8Ptr->getName() + ".ascast");
3489
3490 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3491 RedArrayTy, LHSArrayPtr,
3492 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3493 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3494 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3495 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3496
3497 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3498 LHSPtrs.emplace_back(LHSPtr);
3499 RHSPtrs.emplace_back(RHSPtr);
3500 } else {
3501 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3502 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3503 Value *Reduced;
3504 InsertPointOrErrorTy AfterIP =
3505 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3506 if (!AfterIP)
3507 return AfterIP.takeError();
3508 if (!Builder.GetInsertBlock())
3509 return ReductionFunc;
3510
3511 Builder.restoreIP(*AfterIP);
3512 Builder.CreateStore(Reduced, LHSPtr);
3513 }
3514 }
3515
3516 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3517 for (auto En : enumerate(ReductionInfos)) {
3518 unsigned Index = En.index();
3519 const ReductionInfo &RI = En.value();
3520 Value *LHSFixupPtr, *RHSFixupPtr;
3521 Builder.restoreIP(RI.ReductionGenClang(
3522 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3523
3524 // Fix the callback code generated to use the correct Values for the LHS
3525 // and RHS
3526 LHSFixupPtr->replaceUsesWithIf(
3527 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3528 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3529 ReductionFunc;
3530 });
3531 RHSFixupPtr->replaceUsesWithIf(
3532 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3533 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3534 ReductionFunc;
3535 });
3536 }
3537
3538 Builder.CreateRetVoid();
3539 return ReductionFunc;
3540}
3541
3542static void
3543 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3544 bool IsGPU) {
3545 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3546 (void)RI;
3547 assert(RI.Variable && "expected non-null variable");
3548 assert(RI.PrivateVariable && "expected non-null private variable");
3549 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3550 "expected non-null reduction generator callback");
3551 if (!IsGPU) {
3552 assert(
3553 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3554 "expected variables and their private equivalents to have the same "
3555 "type");
3556 }
3557 assert(RI.Variable->getType()->isPointerTy() &&
3558 "expected variables to be pointers");
3559 }
3560}
3561
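// At a high level, the GPU lowering below emits code roughly equivalent to
// the following sketch (illustrative pseudocode; the real entry points are
// the __kmpc_nvptx_*_reduce_nowait_v2 runtime calls used in the body):
//
//   void *red_list[N] = {priv_1, ..., priv_N};
//   int res = reduce_nowait(loc, size, red_list, shuffle_reduce_fn,
//                           interwarp_copy_fn /*, buffer helpers (teams)*/);
//   if (res == 1) {
//     // The master thread folds the reduced privates into the originals:
//     // var_i = reduce_op_i(var_i, priv_i) for each reduction i.
//   }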
3562OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3563 const LocationDescription &Loc, InsertPointTy AllocaIP,
3564 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3565 bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
3566 std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
3567 Value *SrcLocInfo) {
3568 if (!updateToLocation(Loc))
3569 return InsertPointTy();
3570 Builder.restoreIP(CodeGenIP);
3571 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3572 LLVMContext &Ctx = M.getContext();
3573
3574 // Source location for the ident struct
3575 if (!SrcLocInfo) {
3576 uint32_t SrcLocStrSize;
3577 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3578 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3579 }
3580
3581 if (ReductionInfos.size() == 0)
3582 return Builder.saveIP();
3583
3584 BasicBlock *ContinuationBlock = nullptr;
3585 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3586 // Copied code from createReductions
3587 BasicBlock *InsertBlock = Loc.IP.getBlock();
3588 ContinuationBlock =
3589 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3590 InsertBlock->getTerminator()->eraseFromParent();
3591 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3592 }
3593
3594 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3595 AttributeList FuncAttrs;
3596 AttrBuilder AttrBldr(Ctx);
3597 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3598 AttrBldr.addAttribute(Attr);
3599 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3600 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3601
3602 CodeGenIP = Builder.saveIP();
3603 Expected<Function *> ReductionResult =
3604 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3605 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3606 if (!ReductionResult)
3607 return ReductionResult.takeError();
3608 Function *ReductionFunc = *ReductionResult;
3609 Builder.restoreIP(CodeGenIP);
3610
3611 // Set the grid value in the config needed for lowering later on
3612 if (GridValue.has_value())
3613 Config.setGridValue(GridValue.value());
3614 else
3615 Config.setGridValue(getGridValue(T, ReductionFunc));
3616
3617 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3618 // RedList, shuffle_reduce_func, interwarp_copy_func);
3619 // or
3620 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3621 Value *Res;
3622
3623 // 1. Build a list of reduction variables.
3624 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3625 auto Size = ReductionInfos.size();
3626 Type *PtrTy = PointerType::getUnqual(Ctx);
3627 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3628 CodeGenIP = Builder.saveIP();
3629 Builder.restoreIP(AllocaIP);
3630 Value *ReductionListAlloca =
3631 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3632 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3633 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3634 Builder.restoreIP(CodeGenIP);
3635 Type *IndexTy = Builder.getIndexTy(
3636 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3637 for (auto En : enumerate(ReductionInfos)) {
3638 const ReductionInfo &RI = En.value();
3639 Value *ElemPtr = Builder.CreateInBoundsGEP(
3640 RedArrayTy, ReductionList,
3641 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3642 Value *CastElem =
3643 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3644 Builder.CreateStore(CastElem, ElemPtr);
3645 }
3646 CodeGenIP = Builder.saveIP();
3647 Function *SarFunc =
3648 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3649 Expected<Function *> CopyResult =
3650 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3651 if (!CopyResult)
3652 return CopyResult.takeError();
3653 Function *WcFunc = *CopyResult;
3654 Builder.restoreIP(CodeGenIP);
3655
3656 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3657
3658 unsigned MaxDataSize = 0;
3659 SmallVector<Type *> ReductionTypeArgs;
3660 for (auto En : enumerate(ReductionInfos)) {
3661 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3662 if (Size > MaxDataSize)
3663 MaxDataSize = Size;
3664 ReductionTypeArgs.emplace_back(En.value().ElementType);
3665 }
3666 Value *ReductionDataSize =
3667 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3668 if (!IsTeamsReduction) {
3669 Value *SarFuncCast =
3670 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3671 Value *WcFuncCast =
3672 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3673 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3674 WcFuncCast};
3675 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3676 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3677 Res = Builder.CreateCall(Pv2Ptr, Args);
3678 } else {
3679 CodeGenIP = Builder.saveIP();
3680 StructType *ReductionsBufferTy = StructType::create(
3681 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3682 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
3683 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3684 Function *LtGCFunc = emitListToGlobalCopyFunction(
3685 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3686 Function *LtGRFunc = emitListToGlobalReduceFunction(
3687 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3688 Function *GtLCFunc = emitGlobalToListCopyFunction(
3689 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3690 Function *GtLRFunc = emitGlobalToListReduceFunction(
3691 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3692 Builder.restoreIP(CodeGenIP);
3693
3694 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3695 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3696
3697 Value *Args3[] = {SrcLocInfo,
3698 KernelTeamsReductionPtr,
3699 Builder.getInt32(ReductionBufNum),
3700 ReductionDataSize,
3701 RL,
3702 SarFunc,
3703 WcFunc,
3704 LtGCFunc,
3705 LtGRFunc,
3706 GtLCFunc,
3707 GtLRFunc};
3708
3709 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3710 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3711 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3712 }
3713
3714 // 5. Build if (res == 1)
3715 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3716 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3717 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3718 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3719
3720 // 6. Build then branch: where we have reduced values in the master
3721 // thread in each team.
3722 // __kmpc_end_reduce{_nowait}(<gtid>);
3723 // break;
3724 emitBlock(ThenBB, CurFunc);
3725
3726 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3727 for (auto En : enumerate(ReductionInfos)) {
3728 const ReductionInfo &RI = En.value();
3729 Value *LHS = RI.Variable;
3730 Value *RHS =
3731 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3732
3733 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3734 Value *LHSPtr, *RHSPtr;
3735 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3736 &LHSPtr, &RHSPtr, CurFunc));
3737
3738 // Fix the callback code generated to use the correct Values for the LHS
3739 // and RHS
3740 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3741 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3742 ReductionFunc;
3743 });
3744 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3745 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3746 ReductionFunc;
3747 });
3748 } else {
3749 Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
3750 Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
3751 Value *Reduced;
3752 InsertPointOrErrorTy AfterIP =
3753 RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
3754 if (!AfterIP)
3755 return AfterIP.takeError();
3756 Builder.restoreIP(*AfterIP);
3757 Builder.CreateStore(Reduced, LHS, false);
3758 }
3759 }
3760 emitBlock(ExitBB, CurFunc);
3761 if (ContinuationBlock) {
3762 Builder.CreateBr(ContinuationBlock);
3763 Builder.SetInsertPoint(ContinuationBlock);
3764 }
3765 Config.setEmitLLVMUsed();
3766
3767 return Builder.saveIP();
3768}
3769
3770 static Function *getFreshReductionFunc(Module &M) {
3771 Type *VoidTy = Type::getVoidTy(M.getContext());
3772 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3773 auto *FuncTy =
3774 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3775 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3776 ".omp.reduction.func", &M);
3777}
3778
3779 static Error populateReductionFunction(
3780 Function *ReductionFunc,
3781 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3782 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3783 Module *Module = ReductionFunc->getParent();
3784 BasicBlock *ReductionFuncBlock =
3785 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3786 Builder.SetInsertPoint(ReductionFuncBlock);
3787 Value *LHSArrayPtr = nullptr;
3788 Value *RHSArrayPtr = nullptr;
3789 if (IsGPU) {
3790 // Need to alloca memory here and deal with the pointers before getting
3791 // LHS/RHS pointers out
3792 //
3793 Argument *Arg0 = ReductionFunc->getArg(0);
3794 Argument *Arg1 = ReductionFunc->getArg(1);
3795 Type *Arg0Type = Arg0->getType();
3796 Type *Arg1Type = Arg1->getType();
3797
3798 Value *LHSAlloca =
3799 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3800 Value *RHSAlloca =
3801 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3802 Value *LHSAddrCast =
3803 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3804 Value *RHSAddrCast =
3805 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3806 Builder.CreateStore(Arg0, LHSAddrCast);
3807 Builder.CreateStore(Arg1, RHSAddrCast);
3808 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3809 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3810 } else {
3811 LHSArrayPtr = ReductionFunc->getArg(0);
3812 RHSArrayPtr = ReductionFunc->getArg(1);
3813 }
3814
3815 unsigned NumReductions = ReductionInfos.size();
3816 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3817
3818 for (auto En : enumerate(ReductionInfos)) {
3819 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3820 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3821 RedArrayTy, LHSArrayPtr, 0, En.index());
3822 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3823 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3824 LHSI8Ptr, RI.Variable->getType());
3825 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3826 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3827 RedArrayTy, RHSArrayPtr, 0, En.index());
3828 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3829 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3830 RHSI8Ptr, RI.PrivateVariable->getType());
3831 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3832 Value *Reduced;
3833 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3834 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3835 if (!AfterIP)
3836 return AfterIP.takeError();
3837
3838 Builder.restoreIP(*AfterIP);
3839 // TODO: Consider flagging an error.
3840 if (!Builder.GetInsertBlock())
3841 return Error::success();
3842
3843 // The store is inside the reduction region when using by-ref.
3844 if (!IsByRef[En.index()])
3845 Builder.CreateStore(Reduced, LHSPtr);
3846 }
3847 Builder.CreateRetVoid();
3848 return Error::success();
3849}
3850
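// On the host, the code emitted below follows the usual libomp pattern,
// roughly (illustrative sketch):
//
//   switch (__kmpc_reduce{_nowait}(loc, tid, n, sizeof(red_list), red_list,
//                                  reduction_func, &lock)) {
//   case 1: // non-atomic path
//     <combine privates into the original variables>;
//     __kmpc_end_reduce{_nowait}(loc, tid, &lock);
//     break;
//   case 2: // atomic path
//     <combine each element with its AtomicReductionGen>;
//     break;
//   default: // nothing to do
//     break;
//   }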
3851OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3852 const LocationDescription &Loc, InsertPointTy AllocaIP,
3853 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3854 bool IsNoWait, bool IsTeamsReduction) {
3855 assert(ReductionInfos.size() == IsByRef.size());
3856 if (Config.isGPU())
3857 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3858 IsNoWait, IsTeamsReduction);
3859
3860 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3861
3862 if (!updateToLocation(Loc))
3863 return InsertPointTy();
3864
3865 if (ReductionInfos.size() == 0)
3866 return Builder.saveIP();
3867
3868 BasicBlock *InsertBlock = Loc.IP.getBlock();
3869 BasicBlock *ContinuationBlock =
3870 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3871 InsertBlock->getTerminator()->eraseFromParent();
3872
3873 // Create and populate array of type-erased pointers to private reduction
3874 // values.
3875 unsigned NumReductions = ReductionInfos.size();
3876 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3877 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3878 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3879
3880 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3881
3882 for (auto En : enumerate(ReductionInfos)) {
3883 unsigned Index = En.index();
3884 const ReductionInfo &RI = En.value();
3885 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3886 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3887 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3888 }
3889
3890 // Emit a call to the runtime function that orchestrates the reduction.
3891 // Declare the reduction function in the process.
3892 Type *IndexTy = Builder.getIndexTy(
3893 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3894 Function *Func = Builder.GetInsertBlock()->getParent();
3895 Module *Module = Func->getParent();
3896 uint32_t SrcLocStrSize;
3897 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3898 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3899 return RI.AtomicReductionGen;
3900 });
3901 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3902 CanGenerateAtomic
3903 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3904 : IdentFlag(0));
3905 Value *ThreadId = getOrCreateThreadID(Ident);
3906 Constant *NumVariables = Builder.getInt32(NumReductions);
3907 const DataLayout &DL = Module->getDataLayout();
3908 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3909 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3910 Function *ReductionFunc = getFreshReductionFunc(*Module);
3911 Value *Lock = getOMPCriticalRegionLock(".reduction");
3912 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3913 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3914 : RuntimeFunction::OMPRTL___kmpc_reduce);
3915 CallInst *ReduceCall =
3916 Builder.CreateCall(ReduceFunc,
3917 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3918 ReductionFunc, Lock},
3919 "reduce");
3920
3921 // Create final reduction entry blocks for the atomic and non-atomic case.
3922 // Emit IR that dispatches control flow to one of the blocks based on the
3923 // reduction supporting the atomic mode.
3924 BasicBlock *NonAtomicRedBlock =
3925 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3926 BasicBlock *AtomicRedBlock =
3927 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3928 SwitchInst *Switch =
3929 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3930 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3931 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3932
3933 // Populate the non-atomic reduction using the elementwise reduction function.
3934 // This loads the elements from the global and private variables and reduces
3935 // them before storing back the result to the global variable.
3936 Builder.SetInsertPoint(NonAtomicRedBlock);
3937 for (auto En : enumerate(ReductionInfos)) {
3938 const ReductionInfo &RI = En.value();
3939 Type *ValueType = RI.ElementType;
3940 // We have one less load for the by-ref case because that load is now
3941 // inside the reduction region.
3942 Value *RedValue = RI.Variable;
3943 if (!IsByRef[En.index()]) {
3944 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3945 "red.value." + Twine(En.index()));
3946 }
3947 Value *PrivateRedValue =
3948 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3949 "red.private.value." + Twine(En.index()));
3950 Value *Reduced;
3951 InsertPointOrErrorTy AfterIP =
3952 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3953 if (!AfterIP)
3954 return AfterIP.takeError();
3955 Builder.restoreIP(*AfterIP);
3956
3957 if (!Builder.GetInsertBlock())
3958 return InsertPointTy();
3959 // For the by-ref case, the load is inside the reduction region.
3960 if (!IsByRef[En.index()])
3961 Builder.CreateStore(Reduced, RI.Variable);
3962 }
3963 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3964 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3965 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3966 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3967 Builder.CreateBr(ContinuationBlock);
3968
3969 // Populate the atomic reduction using the atomic elementwise reduction
3970 // function. There are no loads/stores here because they will be happening
3971 // inside the atomic elementwise reduction.
3972 Builder.SetInsertPoint(AtomicRedBlock);
3973 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3974 for (const ReductionInfo &RI : ReductionInfos) {
3975 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3976 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3977 if (!AfterIP)
3978 return AfterIP.takeError();
3979 Builder.restoreIP(*AfterIP);
3980 if (!Builder.GetInsertBlock())
3981 return InsertPointTy();
3982 }
3983 Builder.CreateBr(ContinuationBlock);
3984 } else {
3985 Builder.CreateUnreachable();
3986 }
3987
3988 // Populate the outlined reduction function using the elementwise reduction
3989 // function. Partial values are extracted from the type-erased array of
3990 // pointers to private variables.
3991 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3992 IsByRef, /*isGPU=*/false);
3993 if (Err)
3994 return Err;
3995
3996 if (!Builder.GetInsertBlock())
3997 return InsertPointTy();
3998
3999 Builder.SetInsertPoint(ContinuationBlock);
4000 return Builder.saveIP();
4001}
4002
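// createMaster and createMasked below wrap the region body in a
// conditional runtime guard, roughly (illustrative sketch):
//
//   if (__kmpc_master(&loc, tid)) { // or __kmpc_masked(&loc, tid, filter)
//     <body>
//     __kmpc_end_master(&loc, tid); // or __kmpc_end_masked(&loc, tid)
//   }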
4003OpenMPIRBuilder::InsertPointOrErrorTy
4004OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4005 BodyGenCallbackTy BodyGenCB,
4006 FinalizeCallbackTy FiniCB) {
4007 if (!updateToLocation(Loc))
4008 return Loc.IP;
4009
4010 Directive OMPD = Directive::OMPD_master;
4011 uint32_t SrcLocStrSize;
4012 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4013 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4014 Value *ThreadId = getOrCreateThreadID(Ident);
4015 Value *Args[] = {Ident, ThreadId};
4016
4017 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4018 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4019
4020 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4021 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4022
4023 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4024 /*Conditional*/ true, /*hasFinalize*/ true);
4025}
4026
4027OpenMPIRBuilder::InsertPointOrErrorTy
4028OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4029 BodyGenCallbackTy BodyGenCB,
4030 FinalizeCallbackTy FiniCB, Value *Filter) {
4031 if (!updateToLocation(Loc))
4032 return Loc.IP;
4033
4034 Directive OMPD = Directive::OMPD_masked;
4035 uint32_t SrcLocStrSize;
4036 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4037 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4038 Value *ThreadId = getOrCreateThreadID(Ident);
4039 Value *Args[] = {Ident, ThreadId, Filter};
4040 Value *ArgsEnd[] = {Ident, ThreadId};
4041
4042 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4043 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4044
4045 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4046 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4047
4048 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4049 /*Conditional*/ true, /*hasFinalize*/ true);
4050}
4051
4052 static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
4053 llvm::FunctionCallee Callee,
4054 ArrayRef<llvm::Value *> Args,
4055 const llvm::Twine &Name) {
4056 llvm::CallInst *Call = Builder.CreateCall(
4057 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4058 Call->setDoesNotThrow();
4059 return Call;
4060}
4061
4062 // Expects the input basic block to be dominated by BeforeScanBB.
4063 // Once the scan directive is encountered, the code after it should be
4064 // dominated by AfterScanBB. The scan directive splits the code sequence
4065 // into an input phase and a scan phase. Based on whether the inclusive or
4066 // exclusive clause is used on the scan directive, and on whether the input
4067 // loop or the scan loop is being lowered, it adds jumps to the input and
4068 // scan phases. The first scan loop is the input loop and the second is the
4069 // scan loop. The generated code handles only inclusive scans for now.
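// As a rough sketch (illustrative pseudocode, not emitted verbatim), the
// two lowerings of the user loop end up as:
//
//   // Input loop (OMPFirstScanLoop == true):
//   for (i ...) { <input phase: compute red>; buffer[i] = red; }
//   // ...emitScanReduction computes prefix results over buffer...
//   // Scan loop (OMPFirstScanLoop == false):
//   for (i ...) { red = buffer[i]; <scan phase: use red>; }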
4070OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4071 const LocationDescription &Loc, InsertPointTy AllocaIP,
4072 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4073 bool IsInclusive, ScanInfo *ScanRedInfo) {
4074 if (ScanRedInfo->OMPFirstScanLoop) {
4075 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4076 ScanVarsType, ScanRedInfo);
4077 if (Err)
4078 return Err;
4079 }
4080 if (!updateToLocation(Loc))
4081 return Loc.IP;
4082
4083 llvm::Value *IV = ScanRedInfo->IV;
4084
4085 if (ScanRedInfo->OMPFirstScanLoop) {
4086 // Emit buffer[i] = red; at the end of the input phase.
4087 for (size_t i = 0; i < ScanVars.size(); i++) {
4088 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4089 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4090 Type *DestTy = ScanVarsType[i];
4091 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4092 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4093
4094 Builder.CreateStore(Src, Val);
4095 }
4096 }
4097 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4098 emitBlock(ScanRedInfo->OMPScanDispatch,
4099 Builder.GetInsertBlock()->getParent());
4100
4101 if (!ScanRedInfo->OMPFirstScanLoop) {
4102 IV = ScanRedInfo->IV;
4103 // Emit red = buffer[i]; at the entrance to the scan phase.
4104 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4105 for (size_t i = 0; i < ScanVars.size(); i++) {
4106 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4107 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4108 Type *DestTy = ScanVarsType[i];
4109 Value *SrcPtr =
4110 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4111 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4112 Builder.CreateStore(Src, ScanVars[i]);
4113 }
4114 }
4115
4116 // TODO: Update it to CreateBr and remove dead blocks
4117 llvm::Value *CmpI = Builder.getInt1(true);
4118 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4119 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4120 ScanRedInfo->OMPAfterScanBlock);
4121 } else {
4122 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4123 ScanRedInfo->OMPBeforeScanBlock);
4124 }
4125 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4126 Builder.GetInsertBlock()->getParent());
4127 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4128 return Builder.saveIP();
4129}
4130
4131Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4132 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4133 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4134
4135 Builder.restoreIP(AllocaIP);
4136 // Create the shared pointer at alloca IP.
4137 for (size_t i = 0; i < ScanVars.size(); i++) {
4138 llvm::Value *BuffPtr =
4139 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4140 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4141 }
4142
4143 // Allocate the temporary buffer on the master thread.
4144 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4145 InsertPointTy CodeGenIP) -> Error {
4146 Builder.restoreIP(CodeGenIP);
4147 Value *AllocSpan =
4148 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4149 for (size_t i = 0; i < ScanVars.size(); i++) {
4150 Type *IntPtrTy = Builder.getInt32Ty();
4151 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4152 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4153 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4154 AllocSpan, nullptr, "arr");
4155 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4156 }
4157 return Error::success();
4158 };
4159 // TODO: Perform finalization actions for variables. This has to be
4160 // called for variables which have destructors/finalizers.
4161 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4162
4163 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4164 llvm::Value *FilterVal = Builder.getInt32(0);
4165 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4166 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4167
4168 if (!AfterIP)
4169 return AfterIP.takeError();
4170 Builder.restoreIP(*AfterIP);
4171 BasicBlock *InputBB = Builder.GetInsertBlock();
4172 if (InputBB->getTerminator())
4173 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4174 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4175 if (!AfterIP)
4176 return AfterIP.takeError();
4177 Builder.restoreIP(*AfterIP);
4178
4179 return Error::success();
4180}
4181
4182Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4183 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4184 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4185 InsertPointTy CodeGenIP) -> Error {
4186 Builder.restoreIP(CodeGenIP);
4187 for (ReductionInfo RedInfo : ReductionInfos) {
4188 Value *PrivateVar = RedInfo.PrivateVariable;
4189 Value *OrigVar = RedInfo.Variable;
4190 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4191 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4192
4193 Type *SrcTy = RedInfo.ElementType;
4194 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4195 "arrayOffset");
4196 Value *Src = Builder.CreateLoad(SrcTy, Val);
4197
4198 Builder.CreateStore(Src, OrigVar);
4199 Builder.CreateFree(Buff);
4200 }
4201 return Error::success();
4202 };
4203 // TODO: Perform finalization actions for variables. This has to be
4204 // called for variables which have destructors/finalizers.
4205 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4206
4207 if (ScanRedInfo->OMPScanFinish->getTerminator())
4208 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4209 else
4210 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4211
4212 llvm::Value *FilterVal = Builder.getInt32(0);
4213 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4214 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4215
4216 if (!AfterIP)
4217 return AfterIP.takeError();
4218 Builder.restoreIP(*AfterIP);
4219 BasicBlock *InputBB = Builder.GetInsertBlock();
4220 if (InputBB->getTerminator())
4221 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4222 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4223 if (!AfterIP)
4224 return AfterIP.takeError();
4225 Builder.restoreIP(*AfterIP);
4226 return Error::success();
4227}
4228
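// emitScanReduction turns the partial results stored in the temporary
// buffer into prefix results with a log-time sweep, roughly (illustrative
// sketch; 'op' is the reduction operator):
//
//   for (size k = 0; k <= ceil(log2(n)); ++k)
//     for (size i = n - 1; i >= (1 << k); --i)
//       buffer[i] = op(buffer[i], buffer[i - (1 << k)]);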
4229OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4230 const LocationDescription &Loc,
4231 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4232 ScanInfo *ScanRedInfo) {
4233
4234 if (!updateToLocation(Loc))
4235 return Loc.IP;
4236 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4237 InsertPointTy CodeGenIP) -> Error {
4238 Builder.restoreIP(CodeGenIP);
4239 Function *CurFn = Builder.GetInsertBlock()->getParent();
4240 // for (int k = 0; k <= ceil(log2(n)); ++k)
4241 llvm::BasicBlock *LoopBB =
4242 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4243 llvm::BasicBlock *ExitBB =
4244 splitBB(Builder, false, "omp.outer.log.scan.exit");
4245 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4246 Builder.GetInsertBlock()->getModule(),
4247 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4248 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4249 llvm::Value *Arg =
4250 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4251 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4252 F = llvm::Intrinsic::getOrInsertDeclaration(
4253 Builder.GetInsertBlock()->getModule(),
4254 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4255 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4256 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4257 llvm::Value *NMin1 = Builder.CreateNUWSub(
4258 ScanRedInfo->Span,
4259 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4260 Builder.SetInsertPoint(InputBB);
4261 Builder.CreateBr(LoopBB);
4262 emitBlock(LoopBB, CurFn);
4263 Builder.SetInsertPoint(LoopBB);
4264
4265 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4266 // size pow2k = 1;
4267 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4268 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4269 InputBB);
4270 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4271 InputBB);
4272 // for (size i = n - 1; i >= 2 ^ k; --i)
4273 // tmp[i] op= tmp[i-pow2k];
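 // For illustration (assuming op = + and n = 4): k = 0 combines adjacent
 // elements (tmp[3] += tmp[2], tmp[2] += tmp[1], tmp[1] += tmp[0]) and k = 1
 // combines at distance 2 (tmp[3] += tmp[1], tmp[2] += tmp[0]), after which
 // tmp[i] holds the inclusive prefix sum of the original tmp[0..i].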
4274 llvm::BasicBlock *InnerLoopBB =
4275 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4276 llvm::BasicBlock *InnerExitBB =
4277 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4278 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4279 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4280 emitBlock(InnerLoopBB, CurFn);
4281 Builder.SetInsertPoint(InnerLoopBB);
4282 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4283 IVal->addIncoming(NMin1, LoopBB);
4284 for (ReductionInfo RedInfo : ReductionInfos) {
4285 Value *ReductionVal = RedInfo.PrivateVariable;
4286 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4287 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4288 Type *DestTy = RedInfo.ElementType;
4289 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4290 Value *LHSPtr =
4291 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4292 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4293 Value *RHSPtr =
4294 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4295 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4296 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4297 Value *Result;
4298 InsertPointOrErrorTy AfterIP =
4299 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4300 if (!AfterIP)
4301 return AfterIP.takeError();
4302 Builder.CreateStore(Result, LHSPtr);
4303 }
4304 llvm::Value *NextIVal = Builder.CreateNUWSub(
4305 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4306 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4307 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4308 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4309 emitBlock(InnerExitBB, CurFn);
4310 llvm::Value *Next = Builder.CreateNUWAdd(
4311 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4312 Counter->addIncoming(Next, Builder.GetInsertBlock());
4313 // pow2k <<= 1;
4314 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4315 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4316 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4317 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4318 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4319 return Error::success();
4320 };
4321
4322 // TODO: Perform finalization actions for variables. This has to be
4323 // called for variables which have destructors/finalizers.
4324 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4325
4326 llvm::Value *FilterVal = Builder.getInt32(0);
4327 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4328 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4329
4330 if (!AfterIP)
4331 return AfterIP.takeError();
4332 Builder.restoreIP(*AfterIP);
4333 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4334
4335 if (!AfterIP)
4336 return AfterIP.takeError();
4337 Builder.restoreIP(*AfterIP);
4338 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4339 if (Err)
4340 return Err;
4341
4342 return AfterIP;
4343}
4344
4345Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4346 llvm::function_ref<Error()> InputLoopGen,
4347 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4348 ScanInfo *ScanRedInfo) {
4349
4350 {
4351 // Emit loop with input phase:
4352 // for (i: 0..<num_iters>) {
4353 // <input phase>;
4354 // buffer[i] = red;
4355 // }
4356 ScanRedInfo->OMPFirstScanLoop = true;
4357 Error Err = InputLoopGen();
4358 if (Err)
4359 return Err;
4360 }
4361 {
4362 // Emit loop with scan phase:
4363 // for (i: 0..<num_iters>) {
4364 // red = buffer[i];
4365 // <scan phase>;
4366 // }
4367 ScanRedInfo->OMPFirstScanLoop = false;
4368 Error Err = ScanLoopGen(Builder.saveIP());
4369 if (Err)
4370 return Err;
4371 }
4372 return Error::success();
4373}
4374
4375void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4376 Function *Fun = Builder.GetInsertBlock()->getParent();
4377 ScanRedInfo->OMPScanDispatch =
4378 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4379 ScanRedInfo->OMPAfterScanBlock =
4380 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4381 ScanRedInfo->OMPBeforeScanBlock =
4382 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4383 ScanRedInfo->OMPScanLoopExit =
4384 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4385}
4386CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4387 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4388 BasicBlock *PostInsertBefore, const Twine &Name) {
4389 Module *M = F->getParent();
4390 LLVMContext &Ctx = M->getContext();
4391 Type *IndVarTy = TripCount->getType();
4392
4393 // Create the basic block structure.
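 // Sketch of the emitted control flow: preheader -> header -> cond; cond
 // branches to body while iv < tripcount and to exit otherwise;
 // body -> latch -> header (incrementing iv); exit -> after.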
4394 BasicBlock *Preheader =
4395 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4396 BasicBlock *Header =
4397 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4398 BasicBlock *Cond =
4399 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4400 BasicBlock *Body =
4401 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4402 BasicBlock *Latch =
4403 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4404 BasicBlock *Exit =
4405 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4406 BasicBlock *After =
4407 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4408
4409 // Use specified DebugLoc for new instructions.
4410 Builder.SetCurrentDebugLocation(DL);
4411
4412 Builder.SetInsertPoint(Preheader);
4413 Builder.CreateBr(Header);
4414
4415 Builder.SetInsertPoint(Header);
4416 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4417 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4418 Builder.CreateBr(Cond);
4419
4420 Builder.SetInsertPoint(Cond);
4421 Value *Cmp =
4422 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4423 Builder.CreateCondBr(Cmp, Body, Exit);
4424
4425 Builder.SetInsertPoint(Body);
4426 Builder.CreateBr(Latch);
4427
4428 Builder.SetInsertPoint(Latch);
4429 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4430 "omp_" + Name + ".next", /*HasNUW=*/true);
4431 Builder.CreateBr(Header);
4432 IndVarPHI->addIncoming(Next, Latch);
4433
4434 Builder.SetInsertPoint(Exit);
4435 Builder.CreateBr(After);
4436
4437 // Remember and return the canonical control flow.
4438 LoopInfos.emplace_front();
4439 CanonicalLoopInfo *CL = &LoopInfos.front();
4440
4441 CL->Header = Header;
4442 CL->Cond = Cond;
4443 CL->Latch = Latch;
4444 CL->Exit = Exit;
4445
4446#ifndef NDEBUG
4447 CL->assertOK();
4448#endif
4449 return CL;
4450}
4451
4452Expected<CanonicalLoopInfo *>
4453OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4454 LoopBodyGenCallbackTy BodyGenCB,
4455 Value *TripCount, const Twine &Name) {
4456 BasicBlock *BB = Loc.IP.getBlock();
4457 BasicBlock *NextBB = BB->getNextNode();
4458
4459 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4460 NextBB, NextBB, Name);
4461 BasicBlock *After = CL->getAfter();
4462
4463 // If location is not set, don't connect the loop.
4464 if (updateToLocation(Loc)) {
4465 // Split the loop at the insertion point: Branch to the preheader and move
4466 // every following instruction to after the loop (the After BB). Also, the
4467 // new successor is the loop's after block.
4468 spliceBB(Builder, After, /*CreateBranch=*/false);
4469 Builder.CreateBr(CL->getPreheader());
4470 }
4471
4472 // Emit the body content. We do it after connecting the loop to the CFG so
4473 // that the callback does not encounter degenerate BBs.
4474 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4475 return Err;
4476
4477#ifndef NDEBUG
4478 CL->assertOK();
4479#endif
4480 return CL;
4481}
4482
4483Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4484 ScanInfos.emplace_front();
4485 ScanInfo *Result = &ScanInfos.front();
4486 return Result;
4487}
4488
4489Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4490OpenMPIRBuilder::createCanonicalScanLoops(
4491 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4492 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4493 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4494 LocationDescription ComputeLoc =
4495 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4496 updateToLocation(ComputeLoc);
4497
4498 SmallVector<llvm::CanonicalLoopInfo *> Result;
4499
4500 Value *TripCount = calculateCanonicalLoopTripCount(
4501 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4502 ScanRedInfo->Span = TripCount;
4503 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4504 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4505
4506 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4507 Builder.restoreIP(CodeGenIP);
4508 ScanRedInfo->IV = IV;
4509 createScanBBs(ScanRedInfo);
4510 BasicBlock *InputBlock = Builder.GetInsertBlock();
4511 Instruction *Terminator = InputBlock->getTerminator();
4512 assert(Terminator->getNumSuccessors() == 1);
4513 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4514 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4515 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4516 Builder.GetInsertBlock()->getParent());
4517 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4518 emitBlock(ScanRedInfo->OMPScanLoopExit,
4519 Builder.GetInsertBlock()->getParent());
4520 Builder.CreateBr(ContinueBlock);
4521 Builder.SetInsertPoint(
4522 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4523 return BodyGenCB(Builder.saveIP(), IV);
4524 };
4525
4526 const auto &&InputLoopGen = [&]() -> Error {
4527 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4528 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4529 ComputeIP, Name, true, ScanRedInfo);
4530 if (!LoopInfo)
4531 return LoopInfo.takeError();
4532 Result.push_back(*LoopInfo);
4533 Builder.restoreIP((*LoopInfo)->getAfterIP());
4534 return Error::success();
4535 };
4536 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4537 Expected<CanonicalLoopInfo *> LoopInfo =
4538 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4539 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4540 if (!LoopInfo)
4541 return LoopInfo.takeError();
4542 Result.push_back(*LoopInfo);
4543 Builder.restoreIP((*LoopInfo)->getAfterIP());
4544 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4545 return Error::success();
4546 };
4547 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4548 if (Err)
4549 return Err;
4550 return Result;
4551}
4552
4553Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4554 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4555 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4556
4557 // Consider the following difficulties (assuming 8-bit signed integers):
4558 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4559 // DO I = 1, 100, 50
4560 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4561 // DO I = 100, 0, -128
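 // For illustration: with Start=1, Stop=100, Step=50 and InclusiveStop,
 // Span = 99 and Incr = 50 below, so the computed trip count is
 // 99/50 + 1 = 2 (iterations I = 1 and I = 51).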
4562
4563 // Start, Stop and Step must be of the same integer type.
4564 auto *IndVarTy = cast<IntegerType>(Start->getType());
4565 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4566 assert(IndVarTy == Step->getType() && "Step type mismatch");
4567
4568 updateToLocation(Loc);
4569
4570 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4571 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4572
4573 // Like Step, but always positive.
4574 Value *Incr = Step;
4575
4576 // Distance between Start and Stop; always positive.
4577 Value *Span;
4578
4579 // Condition under which no iterations are executed at all, e.g. because
4580 // UB < LB.
4581 Value *ZeroCmp;
4582
4583 if (IsSigned) {
4584 // Ensure that increment is positive. If not, negate and invert LB and UB.
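 // For illustration: Start=100, Stop=0, Step=-1 with InclusiveStop yields
 // Incr = 1, LB = 0, UB = 100, hence Span = 100 and an inclusive trip count
 // of 100/1 + 1 = 101.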
4585 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4586 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4587 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4588 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4589 Span = Builder.CreateSub(UB, LB, "", false, true);
4590 ZeroCmp = Builder.CreateICmp(
4591 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4592 } else {
4593 Span = Builder.CreateSub(Stop, Start, "", true);
4594 ZeroCmp = Builder.CreateICmp(
4595 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4596 }
4597
4598 Value *CountIfLooping;
4599 if (InclusiveStop) {
4600 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4601 } else {
4602 // Avoid incrementing past stop since it could overflow.
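 // For illustration: Span=7, Incr=3 with an exclusive stop gives
 // (7-1)/3 + 1 = 3 iterations (0, 3, 6); when Span <= Incr, the select
 // below yields a single iteration instead.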
4603 Value *CountIfTwo = Builder.CreateAdd(
4604 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4605 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4606 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4607 }
4608
4609 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4610 "omp_" + Name + ".tripcount");
4611}
4612
4613Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4614 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4615 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4616 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4617 ScanInfo *ScanRedInfo) {
4618 LocationDescription ComputeLoc =
4619 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4620
4621 Value *TripCount = calculateCanonicalLoopTripCount(
4622 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4623
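 // The callback below maps the logical iteration number IV in [0, TripCount)
 // back to the user-visible induction variable: IndVar = Start + IV * Step.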
4624 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4625 Builder.restoreIP(CodeGenIP);
4626 Value *Span = Builder.CreateMul(IV, Step);
4627 Value *IndVar = Builder.CreateAdd(Span, Start);
4628 if (InScan)
4629 ScanRedInfo->IV = IndVar;
4630 return BodyGenCB(Builder.saveIP(), IndVar);
4631 };
4632 LocationDescription LoopLoc =
4633 ComputeIP.isSet()
4634 ? Loc
4635 : LocationDescription(Builder.saveIP(),
4636 Builder.getCurrentDebugLocation());
4637 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4638}
4639
4640// Returns an LLVM function to call for initializing loop bounds using OpenMP
4641// static scheduling for composite `distribute parallel for` depending on
4642// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4643// integers as unsigned similarly to CanonicalLoopInfo.
4644static FunctionCallee
4645getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4646 OpenMPIRBuilder &OMPBuilder) {
4647 unsigned Bitwidth = Ty->getIntegerBitWidth();
4648 if (Bitwidth == 32)
4649 return OMPBuilder.getOrCreateRuntimeFunction(
4650 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4651 if (Bitwidth == 64)
4652 return OMPBuilder.getOrCreateRuntimeFunction(
4653 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4654 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4655}
4656
4657// Returns an LLVM function to call for initializing loop bounds using OpenMP
4658// static scheduling depending on `type`. Only i32 and i64 are supported by the
4659// runtime. Always interpret integers as unsigned similarly to
4660// CanonicalLoopInfo.
4661static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4662 OpenMPIRBuilder &OMPBuilder) {
4663 unsigned Bitwidth = Ty->getIntegerBitWidth();
4664 if (Bitwidth == 32)
4665 return OMPBuilder.getOrCreateRuntimeFunction(
4666 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4667 if (Bitwidth == 64)
4668 return OMPBuilder.getOrCreateRuntimeFunction(
4669 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4670 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4671}
4672
4673OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4674 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4675 WorksharingLoopType LoopType, bool NeedsBarrier) {
4676 assert(CLI->isValid() && "Requires a valid canonical loop");
4677 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4678 "Require dedicated allocate IP");
4679
4680 // Set up the source location value for OpenMP runtime.
4681 Builder.restoreIP(CLI->getPreheaderIP());
4682 Builder.SetCurrentDebugLocation(DL);
4683
4684 uint32_t SrcLocStrSize;
4685 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4686 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4687
4688 // Declare useful OpenMP runtime functions.
4689 Value *IV = CLI->getIndVar();
4690 Type *IVTy = IV->getType();
4691 FunctionCallee StaticInit =
4692 LoopType == WorksharingLoopType::DistributeForStaticLoop
4693 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4694 : getKmpcForStaticInitForType(IVTy, M, *this);
4695 FunctionCallee StaticFini =
4696 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4697
4698 // Allocate space for computed loop bounds as expected by the "init" function.
4699 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4700
4701 Type *I32Type = Type::getInt32Ty(M.getContext());
4702 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4703 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4704 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4705 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4706 CLI->setLastIter(PLastIter);
4707
4708 // At the end of the preheader, prepare for calling the "init" function by
4709 // storing the current loop bounds into the allocated space. A canonical loop
4710 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4711 // and produces an inclusive upper bound.
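 // For illustration (the actual partitioning is chosen by the runtime): with
 // a trip count of 100 and 4 threads, the runtime may hand thread 1 the
 // inclusive bounds lb=25, ub=49; the loop is then rewritten below to run for
 // ub-lb+1 iterations with its body IV offset by lb.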
4712 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4713 Constant *Zero = ConstantInt::get(IVTy, 0);
4714 Constant *One = ConstantInt::get(IVTy, 1);
4715 Builder.CreateStore(Zero, PLowerBound);
4716 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4717 Builder.CreateStore(UpperBound, PUpperBound);
4718 Builder.CreateStore(One, PStride);
4719
4720 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4721
4722 OMPScheduleType SchedType =
4723 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4724 ? OMPScheduleType::OrderedDistribute
4725 : OMPScheduleType::UnorderedStatic;
4726 Constant *SchedulingType =
4727 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4728
4729 // Call the "init" function and update the trip count of the loop with the
4730 // value it produced.
4731 SmallVector<Value *, 10> Args(
4732 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4733 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4734 Value *PDistUpperBound =
4735 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4736 Args.push_back(PDistUpperBound);
4737 }
4738 Args.append({PStride, One, Zero});
4739 Builder.CreateCall(StaticInit, Args);
4740 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4741 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4742 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4743 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4744 CLI->setTripCount(TripCount);
4745
4746 // Update all uses of the induction variable except the one in the condition
4747 // block that compares it with the actual upper bound, and the increment in
4748 // the latch block.
4749
4750 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4751 Builder.SetInsertPoint(CLI->getBody(),
4752 CLI->getBody()->getFirstInsertionPt());
4753 Builder.SetCurrentDebugLocation(DL);
4754 return Builder.CreateAdd(OldIV, LowerBound);
4755 });
4756
4757 // In the "exit" block, call the "fini" function.
4758 Builder.SetInsertPoint(CLI->getExit(),
4759 CLI->getExit()->getTerminator()->getIterator());
4760 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4761
4762 // Add the barrier if requested.
4763 if (NeedsBarrier) {
4764 InsertPointOrErrorTy BarrierIP =
4765 createBarrier(LocationDescription(Builder.saveIP(), DL),
4766 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4767 /* CheckCancelFlag */ false);
4768 if (!BarrierIP)
4769 return BarrierIP.takeError();
4770 }
4771
4772 InsertPointTy AfterIP = CLI->getAfterIP();
4773 CLI->invalidate();
4774
4775 return AfterIP;
4776}
4777
4778OpenMPIRBuilder::InsertPointOrErrorTy
4779OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4780 CanonicalLoopInfo *CLI,
4781 InsertPointTy AllocaIP,
4782 bool NeedsBarrier,
4783 Value *ChunkSize) {
4784 assert(CLI->isValid() && "Requires a valid canonical loop");
4785 assert(ChunkSize && "Chunk size is required");
4786
4787 LLVMContext &Ctx = CLI->getFunction()->getContext();
4788 Value *IV = CLI->getIndVar();
4789 Value *OrigTripCount = CLI->getTripCount();
4790 Type *IVTy = IV->getType();
4791 assert(IVTy->getIntegerBitWidth() <= 64 &&
4792 "Max supported tripcount bitwidth is 64 bits");
4793 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4794 : Type::getInt64Ty(Ctx);
4795 Type *I32Type = Type::getInt32Ty(M.getContext());
4796 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4797 Constant *One = ConstantInt::get(InternalIVTy, 1);
4798
4799 // Declare useful OpenMP runtime functions.
4800 FunctionCallee StaticInit =
4801 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4802 FunctionCallee StaticFini =
4803 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4804
4805 // Allocate space for computed loop bounds as expected by the "init" function.
4806 Builder.restoreIP(AllocaIP);
4807 Builder.SetCurrentDebugLocation(DL);
4808 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4809 Value *PLowerBound =
4810 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4811 Value *PUpperBound =
4812 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4813 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4814 CLI->setLastIter(PLastIter);
4815
4816 // Set up the source location value for the OpenMP runtime.
4817 Builder.restoreIP(CLI->getPreheaderIP());
4818 Builder.SetCurrentDebugLocation(DL);
4819
4820 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4821 Value *CastedChunkSize =
4822 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4823 Value *CastedTripCount =
4824 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4825
4826 Constant *SchedulingType = ConstantInt::get(
4827 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4828 Builder.CreateStore(Zero, PLowerBound);
4829 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4830 Builder.CreateStore(OrigUpperBound, PUpperBound);
4831 Builder.CreateStore(One, PStride);
4832
4833 // Call the "init" function and update the trip count of the loop with the
4834 // value it produced.
4835 uint32_t SrcLocStrSize;
4836 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4837 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4838 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4839 Builder.CreateCall(StaticInit,
4840 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4841 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4842 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4843 /*pstride=*/PStride, /*incr=*/One,
4844 /*chunk=*/CastedChunkSize});
4845
4846 // Load values written by the "init" function.
4847 Value *FirstChunkStart =
4848 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4849 Value *FirstChunkStop =
4850 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4851 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4852 Value *ChunkRange =
4853 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4854 Value *NextChunkStride =
4855 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4856
4857 // Create outer "dispatch" loop for enumerating the chunks.
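 // Sketch of the resulting nest (illustrative pseudocode):
 // for (disp = firstchunk.lb; disp < tripcount; disp += stride) // dispatch
 // for (iv = 0; iv < chunk.tripcount; ++iv) // chunk (the original CLI)
 // body(disp + iv);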
4858 BasicBlock *DispatchEnter = splitBB(Builder, true);
4859 Value *DispatchCounter;
4860
4861 // It is safe to assume this didn't return an error because the callback
4862 // passed into createCanonicalLoop is the only possible error source, and it
4863 // always returns success.
4864 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
4865 {Builder.saveIP(), DL},
4866 [&](InsertPointTy BodyIP, Value *Counter) {
4867 DispatchCounter = Counter;
4868 return Error::success();
4869 },
4870 FirstChunkStart, CastedTripCount, NextChunkStride,
4871 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4872 "dispatch"));
4873
4874 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4875 // not have to preserve the canonical invariant.
4876 BasicBlock *DispatchBody = DispatchCLI->getBody();
4877 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4878 BasicBlock *DispatchExit = DispatchCLI->getExit();
4879 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4880 DispatchCLI->invalidate();
4881
4882 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4883 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4884 redirectTo(CLI->getExit(), DispatchLatch, DL);
4885 redirectTo(DispatchBody, DispatchEnter, DL);
4886
4887 // Prepare the prolog of the chunk loop.
4888 Builder.restoreIP(CLI->getPreheaderIP());
4889 Builder.SetCurrentDebugLocation(DL);
4890
4891 // Compute the number of iterations of the chunk loop.
4892 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4893 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4894 Value *IsLastChunk =
4895 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4896 Value *CountUntilOrigTripCount =
4897 Builder.CreateSub(CastedTripCount, DispatchCounter);
4898 Value *ChunkTripCount = Builder.CreateSelect(
4899 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4900 Value *BackcastedChunkTC =
4901 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4902 CLI->setTripCount(BackcastedChunkTC);
4903
4904 // Update all uses of the induction variable except the one in the condition
4905 // block that compares it with the actual upper bound, and the increment in
4906 // the latch block.
4907 Value *BackcastedDispatchCounter =
4908 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4909 CLI->mapIndVar([&](Instruction *) -> Value * {
4910 Builder.restoreIP(CLI->getBodyIP());
4911 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4912 });
4913
4914 // In the "exit" block, call the "fini" function.
4915 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4916 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4917
4918 // Add the barrier if requested.
4919 if (NeedsBarrier) {
4920 InsertPointOrErrorTy AfterIP =
4921 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4922 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4923 if (!AfterIP)
4924 return AfterIP.takeError();
4925 }
4926
4927#ifndef NDEBUG
4928 // Even though we currently do not support applying additional methods to it,
4929 // the chunk loop should remain a canonical loop.
4930 CLI->assertOK();
4931#endif
4932
4933 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
4934}
4935
4936// Returns an LLVM function to call for executing an OpenMP static worksharing
4937// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4938// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4939static FunctionCallee
4940getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4941 WorksharingLoopType LoopType) {
4942 unsigned Bitwidth = Ty->getIntegerBitWidth();
4943 Module &M = OMPBuilder->M;
4944 switch (LoopType) {
4945 case WorksharingLoopType::ForStaticLoop:
4946 if (Bitwidth == 32)
4947 return OMPBuilder->getOrCreateRuntimeFunction(
4948 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4949 if (Bitwidth == 64)
4950 return OMPBuilder->getOrCreateRuntimeFunction(
4951 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4952 break;
4953 case WorksharingLoopType::DistributeStaticLoop:
4954 if (Bitwidth == 32)
4955 return OMPBuilder->getOrCreateRuntimeFunction(
4956 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4957 if (Bitwidth == 64)
4958 return OMPBuilder->getOrCreateRuntimeFunction(
4959 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4960 break;
4961 case WorksharingLoopType::DistributeForStaticLoop:
4962 if (Bitwidth == 32)
4963 return OMPBuilder->getOrCreateRuntimeFunction(
4964 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4965 if (Bitwidth == 64)
4966 return OMPBuilder->getOrCreateRuntimeFunction(
4967 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4968 break;
4969 }
4970 if (Bitwidth != 32 && Bitwidth != 64) {
4971 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4972 }
4973 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4974}
4975
4976// Inserts a call to the proper OpenMP Device RTL function which handles
4977// loop worksharing.
4978static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4979 WorksharingLoopType LoopType,
4980 BasicBlock *InsertBlock, Value *Ident,
4981 Value *LoopBodyArg, Value *TripCount,
4982 Function &LoopBodyFn) {
4983 Type *TripCountTy = TripCount->getType();
4984 Module &M = OMPBuilder->M;
4985 IRBuilder<> &Builder = OMPBuilder->Builder;
4986 FunctionCallee RTLFn =
4987 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4988 SmallVector<Value *, 8> RealArgs;
4989 RealArgs.push_back(Ident);
4990 RealArgs.push_back(&LoopBodyFn);
4991 RealArgs.push_back(LoopBodyArg);
4992 RealArgs.push_back(TripCount);
4993 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4994 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4995 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4996 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4997 Builder.CreateCall(RTLFn, RealArgs);
4998 return;
4999 }
5000 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5001 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5002 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5003 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5004
5005 RealArgs.push_back(
5006 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5007 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5008 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5009 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5010 }
5011 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5012
5013 Builder.CreateCall(RTLFn, RealArgs);
5014}
5015
5017 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5018 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5019 WorksharingLoopType LoopType) {
5020 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5021 BasicBlock *Preheader = CLI->getPreheader();
5022 Value *TripCount = CLI->getTripCount();
5023
5024 // After loop body outlining, the loop body contains only the setup of the
5025 // loop body argument structure and the call to the outlined loop body
5026 // function. First, we need to move the setup of the loop body args into
5027 // the loop preheader.
5028 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5029 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5030
5031 // The next step is to remove the whole loop. We do not need it anymore.
5032 // That's why we make an unconditional branch from the loop preheader to
5033 // the loop exit block.
5034 Builder.restoreIP({Preheader, Preheader->end()});
5035 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5036 Preheader->getTerminator()->eraseFromParent();
5037 Builder.CreateBr(CLI->getExit());
5038
5039 // Delete dead loop blocks
5040 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5041 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5042 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5043 CleanUpInfo.EntryBB = CLI->getHeader();
5044 CleanUpInfo.ExitBB = CLI->getExit();
5045 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5046 DeleteDeadBlocks(BlocksToBeRemoved);
5047
5048 // Find the value which corresponds to the loop body argument structure
5049 // and remove the call instruction to the loop body function.
5050 Value *LoopBodyArg;
5051 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5052 assert(OutlinedFnUser &&
5053 "Expected unique undroppable user of outlined function");
5054 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5055 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5056 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5057 "Expected outlined function call to be located in loop preheader");
5058 // Check in case no argument structure has been passed.
5059 if (OutlinedFnCallInstruction->arg_size() > 1)
5060 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5061 else
5062 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5063 OutlinedFnCallInstruction->eraseFromParent();
5064
5065 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5066 LoopBodyArg, TripCount, OutlinedFn);
5067
5068 for (auto &ToBeDeletedItem : ToBeDeleted)
5069 ToBeDeletedItem->eraseFromParent();
5070 CLI->invalidate();
5071}
5072
5073OpenMPIRBuilder::InsertPointTy
5074OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
5075 InsertPointTy AllocaIP,
5076 WorksharingLoopType LoopType) {
5077 uint32_t SrcLocStrSize;
5078 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5079 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5080
5081 OutlineInfo OI;
5082 OI.OuterAllocaBB = CLI->getPreheader();
5083 Function *OuterFn = CLI->getPreheader()->getParent();
5084
5085 // Instructions which need to be deleted at the end of code generation
5086 SmallVector<Instruction *, 4> ToBeDeleted;
5087
5088 OI.OuterAllocaBB = AllocaIP.getBlock();
5089
5090 // Mark the body loop as region which needs to be extracted
5091 OI.EntryBB = CLI->getBody();
5092 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5093 "omp.prelatch", true);
5094
5095 // Prepare loop body for extraction
5096 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5097
5098 // Insert a new loop counter variable which will be used only in the loop
5099 // body.
5100 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5101 Instruction *NewLoopCntLoad =
5102 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5103 // New loop counter instructions are redundant in the loop preheader when
5104 // code generation for the workshare loop is finished. That's why we mark
5105 // them as ready for deletion.
5106 ToBeDeleted.push_back(NewLoopCntLoad);
5107 ToBeDeleted.push_back(NewLoopCnt);
5108
5109 // Analyse loop body region. Find all input variables which are used inside
5110 // loop body region.
5111 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5112 SmallVector<BasicBlock *, 32> Blocks;
5113 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5114
5115 CodeExtractorAnalysisCache CEAC(*OuterFn);
5116 CodeExtractor Extractor(Blocks,
5117 /* DominatorTree */ nullptr,
5118 /* AggregateArgs */ true,
5119 /* BlockFrequencyInfo */ nullptr,
5120 /* BranchProbabilityInfo */ nullptr,
5121 /* AssumptionCache */ nullptr,
5122 /* AllowVarArgs */ true,
5123 /* AllowAlloca */ true,
5124 /* AllocationBlock */ CLI->getPreheader(),
5125 /* Suffix */ ".omp_wsloop",
5126 /* AggrArgsIn0AddrSpace */ true);
5127
5128 BasicBlock *CommonExit = nullptr;
5129 SetVector<Value *> SinkingCands, HoistingCands;
5130
5131 // Find allocas outside the loop body region which are used inside loop
5132 // body
5133 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5134
5135 // We need to model the loop body region as the function f(cnt, loop_arg).
5136 // That's why we replace the loop induction variable with the new counter,
5137 // which will be one of the loop body function's arguments.
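 // Sketch of the intended outcome (illustrative): after outlining, the
 // preheader ends up calling the device RTL roughly as
 // __kmpc_*_for_static_loop(ident, &outlined_body, loop_arg, tripcount, ...),
 // and outlined_body receives the per-iteration counter as its own argument
 // instead of using the original induction variable.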
5138 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5139 CLI->getIndVar()->user_end());
5140 for (auto Use : Users) {
5141 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5142 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5143 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5144 }
5145 }
5146 }
5147 // Make sure that the loop counter variable is not merged into the loop
5148 // body function argument structure and is passed as a separate variable.
5149 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5150
5151 // PostOutline CB is invoked when loop body function is outlined and
5152 // loop body is replaced by call to outlined function. We need to add
5153 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
5154 // function will handle loop control logic.
5155 //
5156 OI.PostOutlineCB = [=, ToBeDeletedVec =
5157 std::move(ToBeDeleted)](Function &OutlinedFn) {
5158 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5159 LoopType);
5160 };
5161 addOutlineInfo(std::move(OI));
5162 return CLI->getAfterIP();
5163}
5164
5165OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5166 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5167 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5168 bool HasSimdModifier, bool HasMonotonicModifier,
5169 bool HasNonmonotonicModifier, bool HasOrderedClause,
5170 WorksharingLoopType LoopType) {
5171 if (Config.isTargetDevice())
5172 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
5173 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5174 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5175 HasNonmonotonicModifier, HasOrderedClause);
5176
5177 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5178 OMPScheduleType::ModifierOrdered;
5179 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5180 case OMPScheduleType::BaseStatic:
5181 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5182 if (IsOrdered)
5183 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5184 NeedsBarrier, ChunkSize);
5185 // FIXME: Monotonicity ignored?
5186 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5187
5188 case OMPScheduleType::BaseStaticChunked:
5189 if (IsOrdered)
5190 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5191 NeedsBarrier, ChunkSize);
5192 // FIXME: Monotonicity ignored?
5193 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5194 ChunkSize);
5195
5196 case OMPScheduleType::BaseRuntime:
5197 case OMPScheduleType::BaseAuto:
5198 case OMPScheduleType::BaseGreedy:
5199 case OMPScheduleType::BaseBalanced:
5200 case OMPScheduleType::BaseSteal:
5201 case OMPScheduleType::BaseGuidedSimd:
5202 case OMPScheduleType::BaseRuntimeSimd:
5203 assert(!ChunkSize &&
5204 "schedule type does not support user-defined chunk sizes");
5205 [[fallthrough]];
5206 case OMPScheduleType::BaseDynamicChunked:
5207 case OMPScheduleType::BaseGuidedChunked:
5208 case OMPScheduleType::BaseGuidedIterativeChunked:
5209 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5210 case OMPScheduleType::BaseStaticBalancedChunked:
5211 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5212 NeedsBarrier, ChunkSize);
5213
5214 default:
5215 llvm_unreachable("Unknown/unimplemented schedule kind");
5216 }
5217}
5218
5219/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5220/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5221/// the runtime. Always interpret integers as unsigned similarly to
5222/// CanonicalLoopInfo.
5223static FunctionCallee
5224getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5225 unsigned Bitwidth = Ty->getIntegerBitWidth();
5226 if (Bitwidth == 32)
5227 return OMPBuilder.getOrCreateRuntimeFunction(
5228 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5229 if (Bitwidth == 64)
5230 return OMPBuilder.getOrCreateRuntimeFunction(
5231 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5232 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5233}
5234
5235/// Returns an LLVM function to call for updating the next loop using OpenMP
5236/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5237/// the runtime. Always interpret integers as unsigned similarly to
5238/// CanonicalLoopInfo.
5239static FunctionCallee
5240getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5241 unsigned Bitwidth = Ty->getIntegerBitWidth();
5242 if (Bitwidth == 32)
5243 return OMPBuilder.getOrCreateRuntimeFunction(
5244 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5245 if (Bitwidth == 64)
5246 return OMPBuilder.getOrCreateRuntimeFunction(
5247 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5248 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5249}
5250
5251/// Returns an LLVM function to call for finalizing the dynamic loop,
5252/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5253/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5254static FunctionCallee
5255getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5256 unsigned Bitwidth = Ty->getIntegerBitWidth();
5257 if (Bitwidth == 32)
5258 return OMPBuilder.getOrCreateRuntimeFunction(
5259 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5260 if (Bitwidth == 64)
5261 return OMPBuilder.getOrCreateRuntimeFunction(
5262 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5263 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5264}
5265
5266OpenMPIRBuilder::InsertPointOrErrorTy
5267OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5268 InsertPointTy AllocaIP,
5269 OMPScheduleType SchedType,
5270 bool NeedsBarrier, Value *Chunk) {
5271 assert(CLI->isValid() && "Requires a valid canonical loop");
5272 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5273 "Require dedicated allocate IP");
5275 "Require valid schedule type");
5276
5277 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5278 OMPScheduleType::ModifierOrdered;
5279
5280 // Set up the source location value for OpenMP runtime.
5281 Builder.SetCurrentDebugLocation(DL);
5282
5283 uint32_t SrcLocStrSize;
5284 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5285 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5286
5287 // Declare useful OpenMP runtime functions.
5288 Value *IV = CLI->getIndVar();
5289 Type *IVTy = IV->getType();
5290 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5291 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5292
5293 // Allocate space for computed loop bounds as expected by the "init" function.
5294 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5295 Type *I32Type = Type::getInt32Ty(M.getContext());
5296 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5297 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5298 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5299 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5300 CLI->setLastIter(PLastIter);
5301
5302 // At the end of the preheader, prepare for calling the "init" function by
5303 // storing the current loop bounds into the allocated space. A canonical loop
5304 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5305 // and produces an inclusive upper bound.
5306 BasicBlock *PreHeader = CLI->getPreheader();
5307 Builder.SetInsertPoint(PreHeader->getTerminator());
5308 Constant *One = ConstantInt::get(IVTy, 1);
5309 Builder.CreateStore(One, PLowerBound);
5310 Value *UpperBound = CLI->getTripCount();
5311 Builder.CreateStore(UpperBound, PUpperBound);
5312 Builder.CreateStore(One, PStride);
5313
5314 BasicBlock *Header = CLI->getHeader();
5315 BasicBlock *Exit = CLI->getExit();
5316 BasicBlock *Cond = CLI->getCond();
5317 BasicBlock *Latch = CLI->getLatch();
5318 InsertPointTy AfterIP = CLI->getAfterIP();
5319
5320 // The CLI will be "broken" in the code below, as the loop is no longer
5321 // a valid canonical loop.
5322
5323 if (!Chunk)
5324 Chunk = One;
5325
5326 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5327
5328 Constant *SchedulingType =
5329 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5330
5331 // Call the "init" function.
5332 Builder.CreateCall(DynamicInit,
5333 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5334 UpperBound, /* step */ One, Chunk});
5335
5336 // An outer loop around the existing one.
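 // Conceptually (a rough sketch of the rewritten control flow):
 // while (__kmpc_dispatch_next(..., &lb, &ub, &stride)) // outer cond
 // for (iv = lb; iv <= ub; ++iv) // patched inner loop
 // body(iv);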
5337 BasicBlock *OuterCond = BasicBlock::Create(
5338 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5339 PreHeader->getParent());
5340 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5341 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5342 Value *Res =
5343 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5344 PLowerBound, PUpperBound, PStride});
5345 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5346 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5347 Value *LowerBound =
5348 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5349 Builder.CreateCondBr(MoreWork, Header, Exit);
5350
5351 // Change PHI-node in loop header to use outer cond rather than preheader,
5352 // and set IV to the LowerBound.
5353 Instruction *Phi = &Header->front();
5354 auto *PI = cast<PHINode>(Phi);
5355 PI->setIncomingBlock(0, OuterCond);
5356 PI->setIncomingValue(0, LowerBound);
5357
5358 // Then set the pre-header to jump to the OuterCond
5359 Instruction *Term = PreHeader->getTerminator();
5360 auto *Br = cast<BranchInst>(Term);
5361 Br->setSuccessor(0, OuterCond);
5362
5363 // Modify the inner condition:
5364 // * Use the UpperBound returned from the DynamicNext call.
5365 // * Jump to the outer loop when done with one of the inner loops.
5366 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5367 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5368 Instruction *Comp = &*Builder.GetInsertPoint();
5369 auto *CI = cast<CmpInst>(Comp);
5370 CI->setOperand(1, UpperBound);
5371 // Redirect the inner exit to branch to outer condition.
5372 Instruction *Branch = &Cond->back();
5373 auto *BI = cast<BranchInst>(Branch);
5374 assert(BI->getSuccessor(1) == Exit);
5375 BI->setSuccessor(1, OuterCond);
5376
5377 // Call the "fini" function if "ordered" is present in the wsloop directive.
5378 if (Ordered) {
5379 Builder.SetInsertPoint(&Latch->back());
5380 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5381 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5382 }
5383
5384 // Add the barrier if requested.
5385 if (NeedsBarrier) {
5386 Builder.SetInsertPoint(&Exit->back());
5387 InsertPointOrErrorTy BarrierIP =
5388 createBarrier(LocationDescription(Builder.saveIP(), DL),
5389 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5390 /* CheckCancelFlag */ false);
5391 if (!BarrierIP)
5392 return BarrierIP.takeError();
5393 }
5394
5395 CLI->invalidate();
5396 return AfterIP;
5397}
5398
5399/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5400/// after this \p OldTarget will be orphaned.
5401static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5402 BasicBlock *NewTarget, DebugLoc DL) {
5403 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5404 redirectTo(Pred, NewTarget, DL);
5405}
5406
5407/// Determine which blocks in \p BBs are reachable from outside and remove the
5408/// ones that are not reachable from the function.
5409static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5410 SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
5411 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5412 for (Use &U : BB->uses()) {
5413 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5414 if (!UseInst)
5415 continue;
5416 if (BBsToErase.count(UseInst->getParent()))
5417 continue;
5418 return true;
5419 }
5420 return false;
5421 };
5422
5423 while (BBsToErase.remove_if(HasRemainingUses)) {
5424 // Try again if anything was removed.
5425 }
5426
5427 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5428 DeleteDeadBlocks(BBVec);
5429}
5430
5431CanonicalLoopInfo *
5432OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5433 InsertPointTy ComputeIP) {
5434 assert(Loops.size() >= 1 && "At least one loop required");
5435 size_t NumLoops = Loops.size();
5436
5437 // Nothing to do if there is already just one loop.
5438 if (NumLoops == 1)
5439 return Loops.front();
5440
5441 CanonicalLoopInfo *Outermost = Loops.front();
5442 CanonicalLoopInfo *Innermost = Loops.back();
5443 BasicBlock *OrigPreheader = Outermost->getPreheader();
5444 BasicBlock *OrigAfter = Outermost->getAfter();
5445 Function *F = OrigPreheader->getParent();
5446
5447 // Loop control blocks that may become orphaned later.
5448 SmallVector<BasicBlock *, 12> OldControlBBs;
5449 OldControlBBs.reserve(6 * Loops.size());
5450 for (CanonicalLoopInfo *Loop : Loops)
5451 Loop->collectControlBlocks(OldControlBBs);
5452
5453 // Setup the IRBuilder for inserting the trip count computation.
5454 Builder.SetCurrentDebugLocation(DL);
5455 if (ComputeIP.isSet())
5456 Builder.restoreIP(ComputeIP);
5457 else
5458 Builder.restoreIP(Outermost->getPreheaderIP());
5459
5460 // Derive the collapsed loop's trip count.
5461 // TODO: Find common/largest indvar type.
5462 Value *CollapsedTripCount = nullptr;
5463 for (CanonicalLoopInfo *L : Loops) {
5464 assert(L->isValid() &&
5465 "All loops to collapse must be valid canonical loops");
5466 Value *OrigTripCount = L->getTripCount();
5467 if (!CollapsedTripCount) {
5468 CollapsedTripCount = OrigTripCount;
5469 continue;
5470 }
5471
5472 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5473 CollapsedTripCount = Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5474 }
5475
5476 // Create the collapsed loop control flow.
5477 CanonicalLoopInfo *Result =
5478 createLoopSkeleton(DL, CollapsedTripCount, F,
5479 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5480
5481 // Build the collapsed loop body code.
5482 // Start with deriving the input loop induction variables from the collapsed
5483 // one, using a divmod scheme. To preserve the original loops' order, the
5484 // innermost loop uses the least significant bits.
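 // For illustration: collapsing two loops with trip counts TC0 (outer) and
 // TC1 (inner) yields one loop of TC0*TC1 iterations where, for a collapsed
 // induction variable cv, the original indvars are recovered as
 // iv1 = cv % TC1 and iv0 = cv / TC1.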
5485 Builder.restoreIP(Result->getBodyIP());
5486
5487 Value *Leftover = Result->getIndVar();
5488 SmallVector<Value *> NewIndVars;
5489 NewIndVars.resize(NumLoops);
5490 for (int i = NumLoops - 1; i >= 1; --i) {
5491 Value *OrigTripCount = Loops[i]->getTripCount();
5492
5493 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5494 NewIndVars[i] = NewIndVar;
5495
5496 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5497 }
5498 // Outermost loop gets all the remaining bits.
5499 NewIndVars[0] = Leftover;
5500
5501 // Construct the loop body control flow.
5502 // We progressively construct the branch structure following the direction of
5503 // the control flow: from the leading in-between code, through the loop nest
5504 // body and the trailing in-between code, to rejoining the collapsed loop's
5505 // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
5506 // next edge. If ContinueBlock is set, continue with that block. If
5507 // ContinuePred is set, use its predecessors as sources.
5508 BasicBlock *ContinueBlock = Result->getBody();
5509 BasicBlock *ContinuePred = nullptr;
5510 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5511 BasicBlock *NextSrc) {
5512 if (ContinueBlock)
5513 redirectTo(ContinueBlock, Dest, DL);
5514 else
5515 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5516
5517 ContinueBlock = nullptr;
5518 ContinuePred = NextSrc;
5519 };
5520
5521 // The code before the nested loop of each level.
5522 // Because we are sinking it into the nest, it will be executed more often
5523 // than in the original loop. More sophisticated schemes could keep track of
5524 // what the in-between code is and instantiate it only once per thread.
5525 for (size_t i = 0; i < NumLoops - 1; ++i)
5526 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5527
5528 // Connect the loop nest body.
5529 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5530
5531 // The code after the nested loop at each level.
5532 for (size_t i = NumLoops - 1; i > 0; --i)
5533 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5534
5535 // Connect the finished loop to the collapsed loop latch.
5536 ContinueWith(Result->getLatch(), nullptr);
5537
5538 // Replace the input loops with the new collapsed loop.
5539 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5540 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5541
5542 // Replace the input loop indvars with the derived ones.
5543 for (size_t i = 0; i < NumLoops; ++i)
5544 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5545
5546 // Remove unused parts of the input loops.
5547 removeUnusedBlocksFromParent(OldControlBBs);
5548
5549 for (CanonicalLoopInfo *L : Loops)
5550 L->invalidate();
5551
5552#ifndef NDEBUG
5553 Result->assertOK();
5554#endif
5555 return Result;
5556}
5557
5558std::vector<CanonicalLoopInfo *>
5559OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5560 ArrayRef<Value *> TileSizes) {
5561 assert(TileSizes.size() == Loops.size() &&
5562 "Must pass as many tile sizes as there are loops");
5563 int NumLoops = Loops.size();
5564 assert(NumLoops >= 1 && "At least one loop to tile required");
5565
5566 CanonicalLoopInfo *OutermostLoop = Loops.front();
5567 CanonicalLoopInfo *InnermostLoop = Loops.back();
5568 Function *F = OutermostLoop->getBody()->getParent();
5569 BasicBlock *InnerEnter = InnermostLoop->getBody();
5570 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5571
5572 // Loop control blocks that may become orphaned later.
5573 SmallVector<BasicBlock *, 12> OldControlBBs;
5574 OldControlBBs.reserve(6 * Loops.size());
5575 for (CanonicalLoopInfo *Loop : Loops)
5576 Loop->collectControlBlocks(OldControlBBs);
5577
5578 // Collect original trip counts and induction variables to be accessible by
5579 // index. Also, the structure of the original loops is not preserved during
5580 // the construction of the tiled loops, so do it before we scavenge the BBs of
5581 // any original CanonicalLoopInfo.
5582 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5583 for (CanonicalLoopInfo *L : Loops) {
5584 assert(L->isValid() && "All input loops must be valid canonical loops");
5585 OrigTripCounts.push_back(L->getTripCount());
5586 OrigIndVars.push_back(L->getIndVar());
5587 }
5588
5589 // Collect the code between loop headers. These may contain SSA definitions
5590 // that are used in the loop nest body. To be usable within the innermost
5591 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5592 // these instructions may be executed more often than before the tiling.
5593 // TODO: It would be sufficient to only sink them into body of the
5594 // corresponding tile loop.
5595 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5596 for (int i = 0; i < NumLoops - 1; ++i) {
5597 CanonicalLoopInfo *Surrounding = Loops[i];
5598 CanonicalLoopInfo *Nested = Loops[i + 1];
5599
5600 BasicBlock *EnterBB = Surrounding->getBody();
5601 BasicBlock *ExitBB = Nested->getHeader();
5602 InbetweenCode.emplace_back(EnterBB, ExitBB);
5603 }
5604
5605 // Compute the trip counts of the floor loops.
5606 Builder.SetCurrentDebugLocation(DL);
5607 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5608 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5609 for (int i = 0; i < NumLoops; ++i) {
5610 Value *TileSize = TileSizes[i];
5611 Value *OrigTripCount = OrigTripCounts[i];
5612 Type *IVType = OrigTripCount->getType();
5613
5614 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5615 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5616
5617 // 0 if the tile size divides the trip count, 1 otherwise.
5618 // 1 means we need an additional iteration for a partial tile.
5619 //
5620 // Unfortunately we cannot just use the roundup formula
5621 //   (tripcount + tilesize - 1) / tilesize
5622 // because the addition might overflow. We do not want to introduce undefined
5623 // behavior when the untiled loop nest did not.
5624 Value *FloorTripOverflow =
5625 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5626
5627 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5628 Value *FloorTripCount =
5629 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5630 "omp_floor" + Twine(i) + ".tripcount", true);
5631
5632 // Remember some values for later use.
5633 FloorCompleteCount.push_back(FloorCompleteTripCount);
5634 FloorCount.push_back(FloorTripCount);
5635 FloorRems.push_back(FloorTripRem);
5636 }
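// For example, with OrigTripCount = 10 and TileSize = 4:
// FloorCompleteTripCount = 10 / 4 = 2, FloorTripRem = 10 % 4 = 2, so
// FloorTripOverflow = 1 and FloorTripCount = 3, i.e. two complete tiles plus
// one partial tile, without ever evaluating the overflow-prone 10 + 4 - 1.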
5637
5638 // Generate the new loop nest, from the outermost to the innermost.
5639 std::vector<CanonicalLoopInfo *> Result;
5640 Result.reserve(NumLoops * 2);
5641
5642 // The basic block of the surrounding loop that enters the newly generated
5643 // loop nest.
5644 BasicBlock *Enter = OutermostLoop->getPreheader();
5645
5646 // The basic block of the surrounding loop where the inner code should
5647 // continue.
5648 BasicBlock *Continue = OutermostLoop->getAfter();
5649
5650 // Where the next loop basic block should be inserted.
5651 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5652
5653 auto EmbeddNewLoop =
5654 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5655 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5656 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5657 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5658 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5659 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5660
5661 // Set up the position where the next embedded loop connects to this loop.
5662 Enter = EmbeddedLoop->getBody();
5663 Continue = EmbeddedLoop->getLatch();
5664 OutroInsertBefore = EmbeddedLoop->getLatch();
5665 return EmbeddedLoop;
5666 };
5667
5668 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5669 const Twine &NameBase) {
5670 for (auto P : enumerate(TripCounts)) {
5671 CanonicalLoopInfo *EmbeddedLoop =
5672 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5673 Result.push_back(EmbeddedLoop);
5674 }
5675 };
5676
5677 EmbeddNewLoops(FloorCount, "floor");
5678
5679 // Within the innermost floor loop, emit the code that computes the trip
5680 // counts of the tile loops.
5681 Builder.SetInsertPoint(Enter->getTerminator());
5682 SmallVector<Value *, 4> TileCounts;
5683 for (int i = 0; i < NumLoops; ++i) {
5684 CanonicalLoopInfo *FloorLoop = Result[i];
5685 Value *TileSize = TileSizes[i];
5686
5687 Value *FloorIsEpilogue =
5688 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5689 Value *TileTripCount =
5690 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5691
5692 TileCounts.push_back(TileTripCount);
5693 }
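// Continuing the example above, only the floor iteration whose induction
// variable equals FloorCompleteCount (the first partial tile, if any) gets the
// remainder FloorRems[i] as its tile trip count; all complete tiles run the
// full TileSize iterations.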
5694
5695 // Create the tile loops.
5696 EmbeddNewLoops(TileCounts, "tile");
5697
5698 // Insert the in-between code into the body.
5699 BasicBlock *BodyEnter = Enter;
5700 BasicBlock *BodyEntered = nullptr;
5701 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5702 BasicBlock *EnterBB = P.first;
5703 BasicBlock *ExitBB = P.second;
5704
5705 if (BodyEnter)
5706 redirectTo(BodyEnter, EnterBB, DL);
5707 else
5708 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5709
5710 BodyEnter = nullptr;
5711 BodyEntered = ExitBB;
5712 }
5713
5714 // Append the original loop nest body into the generated loop nest body.
5715 if (BodyEnter)
5716 redirectTo(BodyEnter, InnerEnter, DL);
5717 else
5718 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5720
5721 // Replace the original induction variables with induction variables computed
5722 // from the tile and floor induction variables.
5723 Builder.restoreIP(Result.back()->getBodyIP());
5724 for (int i = 0; i < NumLoops; ++i) {
5725 CanonicalLoopInfo *FloorLoop = Result[i];
5726 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5727 Value *OrigIndVar = OrigIndVars[i];
5728 Value *Size = TileSizes[i];
5729
5730 Value *Scale =
5731 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5732 Value *Shift =
5733 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5734 OrigIndVar->replaceAllUsesWith(Shift);
5735 }
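// E.g. with TileSize = 4, a floor indvar of 2 and a tile indvar of 1
// reconstruct the original indvar as 2 * 4 + 1 = 9. Neither operation can wrap
// because the result never exceeds the original trip count, hence the NUW
// flags.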
5736
5737 // Remove unused parts of the original loops.
5738 removeUnusedBlocksFromParent(OldControlBBs);
5739
5740 for (CanonicalLoopInfo *L : Loops)
5741 L->invalidate();
5742
5743#ifndef NDEBUG
5744 for (CanonicalLoopInfo *GenL : Result)
5745 GenL->assertOK();
5746#endif
5747 return Result;
5748}
5749
5750/// Attach metadata \p Properties to the basic block described by \p BB. If the
5751/// basic block already has metadata, the basic block properties are appended.
5752 static void addBasicBlockMetadata(BasicBlock *BB,
5753                                   ArrayRef<Metadata *> Properties) {
5754 // Nothing to do if no property to attach.
5755 if (Properties.empty())
5756 return;
5757
5758 LLVMContext &Ctx = BB->getContext();
5759 SmallVector<Metadata *> NewProperties;
5760 NewProperties.push_back(nullptr); // Placeholder for the self-reference created below.
5761
5762 // If the basic block already has metadata, prepend it to the new metadata.
5763 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5764 if (Existing)
5765 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5766
5767 append_range(NewProperties, Properties);
5768 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5769 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5770
5771 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5772}
5773
5774/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5775/// loop already has metadata, the loop properties are appended.
5776static void addLoopMetadata(CanonicalLoopInfo *Loop,
5777 ArrayRef<Metadata *> Properties) {
5778 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5779
5780 // Attach metadata to the loop's latch
5781 BasicBlock *Latch = Loop->getLatch();
5782 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5783 addBasicBlockMetadata(Latch, Properties);
5784}
5785
5786/// Attach llvm.access.group metadata to the memref instructions of \p Block
5787static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5788 LoopInfo &LI) {
5789 for (Instruction &I : *Block) {
5790 if (I.mayReadOrWriteMemory()) {
5791     // TODO: This instruction may already have an access group from
5792     // other pragmas, e.g. #pragma clang loop vectorize. Append
5793 // so that the existing metadata is not overwritten.
5794 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5795 }
5796 }
5797}
5798
5799void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5800 LLVMContext &Ctx = Builder.getContext();
5801 addLoopMetadata(
5802 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5803 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5804}
5805
5806void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5807 LLVMContext &Ctx = Builder.getContext();
5808 addLoopMetadata(
5809 Loop, {
5810 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5811 });
5812}
5813
5814void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5815 Value *IfCond, ValueToValueMapTy &VMap,
5816 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5817 const Twine &NamePrefix) {
5818 Function *F = CanonicalLoop->getFunction();
5819
5820 // We can't do
5821 // if (cond) {
5822 // simd_loop;
5823 // } else {
5824 // non_simd_loop;
5825 // }
5826 // because then the CanonicalLoopInfo would only point to one of the loops,
5827 // causing other constructs operating on the same loop to malfunction.
5828 // Instead generate
5829 // while (...) {
5830 // if (cond) {
5831 // simd_body;
5832 // } else {
5833 // not_simd_body;
5834 // }
5835 // }
5836 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5837 // body at -O3.
5838
5839 // Define where the if branch should be inserted.
5840 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5841
5842 // Create additional blocks for the if statement
5843 BasicBlock *Cond = SplitBeforeIt->getParent();
5844 llvm::LLVMContext &C = Cond->getContext();
5845 BasicBlock *ThenBlock = BasicBlock::Create(
5846     C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5847 BasicBlock *ElseBlock = BasicBlock::Create(
5848     C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5849
5850 // Create if condition branch.
5851 Builder.SetInsertPoint(SplitBeforeIt);
5852 Instruction *BrInstr =
5853 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5854 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5855 // Then block contains branch to omp loop body which needs to be vectorized
5856 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5857 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5858
5859 Builder.SetInsertPoint(ElseBlock);
5860
5861 // Clone loop for the else branch
5862 SmallVector<BasicBlock *, 8> NewBlocks;
5863
5864 SmallVector<BasicBlock *, 8> ExistingBlocks;
5865 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5866 ExistingBlocks.push_back(ThenBlock);
5867 ExistingBlocks.append(L->block_begin(), L->block_end());
5868 // Cond is the block that has the if clause condition
5869 // LoopCond is omp_loop.cond
5870 // LoopHeader is omp_loop.header
5871 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5872 BasicBlock *LoopHeader = LoopCond ? LoopCond->getUniquePredecessor() : nullptr;
5873 assert(LoopCond && LoopHeader && "Invalid loop structure");
5874 for (BasicBlock *Block : ExistingBlocks) {
5875 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5876 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5877 continue;
5878 }
5879 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5880
5881     // Fix the cloned block's name so it is not NamePrefix + ".if.then".
5882 if (Block == ThenBlock)
5883 NewBB->setName(NamePrefix + ".if.else");
5884
5885 NewBB->moveBefore(CanonicalLoop->getExit());
5886 VMap[Block] = NewBB;
5887 NewBlocks.push_back(NewBB);
5888 }
5889 remapInstructionsInBlocks(NewBlocks, VMap);
5890 Builder.CreateBr(NewBlocks.front());
5891
5892 // The loop latch must have only one predecessor. Currently it is branched to
5893 // from both the 'then' and 'else' branches.
5894 L->getLoopLatch()->splitBasicBlock(
5895 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5896
5897 // Ensure that the 'then' block is added to the loop so that we add the
5898 // attributes in the next step.
5899 L->addBasicBlockToLoop(ThenBlock, LI);
5900}
5901
5902unsigned
5903OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5904 const StringMap<bool> &Features) {
5905 if (TargetTriple.isX86()) {
5906 if (Features.lookup("avx512f"))
5907 return 512;
5908 else if (Features.lookup("avx"))
5909 return 256;
5910 return 128;
5911 }
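// These widths match the widest vector register size in bits for the feature
// set: 512-bit ZMM with avx512f, 256-bit YMM with avx, otherwise 128-bit XMM.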
5912 if (TargetTriple.isPPC())
5913 return 128;
5914 if (TargetTriple.isWasm())
5915 return 128;
5916 return 0;
5917}
5918
5919void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5920 MapVector<Value *, Value *> AlignedVars,
5921 Value *IfCond, OrderKind Order,
5922 ConstantInt *Simdlen, ConstantInt *Safelen) {
5923 LLVMContext &Ctx = Builder.getContext();
5924
5925 Function *F = CanonicalLoop->getFunction();
5926
5927 // TODO: We should not rely on pass manager. Currently we use pass manager
5928 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5929 // object. We should have a method which returns all blocks between
5930 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5931 FunctionAnalysisManager FAM;
5932 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5933 FAM.registerPass([]() { return LoopAnalysis(); });
5934 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5935
5936 LoopAnalysis LIA;
5937 LoopInfo &&LI = LIA.run(*F, FAM);
5938
5939 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5940 if (!AlignedVars.empty()) {
5941 InsertPointTy IP = Builder.saveIP();
5942 for (auto &AlignedItem : AlignedVars) {
5943 Value *AlignedPtr = AlignedItem.first;
5944 Value *Alignment = AlignedItem.second;
5945     // Aligned pointers are expected to be defined by an instruction.
5945     Instruction *AlignedInst = cast<Instruction>(AlignedPtr);
5946     Builder.SetInsertPoint(AlignedInst->getNextNode());
5947 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5948 Alignment);
5949 }
5950 Builder.restoreIP(IP);
5951 }
5952
5953 if (IfCond) {
5954 ValueToValueMapTy VMap;
5955 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5956 }
5957
5958 SmallSetVector<BasicBlock *, 8> Reachable;
5959
5960 // Get the basic blocks from the loop in which memref instructions
5961 // can be found.
5962 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5963 // preferably without running any passes.
5964 for (BasicBlock *Block : L->getBlocks()) {
5965 if (Block == CanonicalLoop->getCond() ||
5966 Block == CanonicalLoop->getHeader())
5967 continue;
5968 Reachable.insert(Block);
5969 }
5970
5971 SmallVector<Metadata *> LoopMDList;
5972
5973 // In presence of finite 'safelen', it may be unsafe to mark all
5974 // the memory instructions parallel, because loop-carried
5975 // dependences of 'safelen' iterations are possible.
5976 // If clause order(concurrent) is specified then the memory instructions
5977 // are marked parallel even if 'safelen' is finite.
5978 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5979 // Add access group metadata to memory-access instructions.
5980 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5981 for (BasicBlock *BB : Reachable)
5982 addSimdMetadata(BB, AccessGroup, LI);
5983 // TODO: If the loop has existing parallel access metadata, have
5984 // to combine two lists.
5985 LoopMDList.push_back(MDNode::get(
5986 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5987 }
5988
5989 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5990 // versions so we can't add the loop attributes in that case.
5991 if (IfCond) {
5992 // we can still add llvm.loop.parallel_access
5993 addLoopMetadata(CanonicalLoop, LoopMDList);
5994 return;
5995 }
5996
5997 // Use the above access group metadata to create loop level
5998 // metadata, which should be distinct for each loop.
5999 ConstantAsMetadata *BoolConst =
6000     ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6001 LoopMDList.push_back(MDNode::get(
6002 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6003
6004 if (Simdlen || Safelen) {
6005 // If both simdlen and safelen clauses are specified, the value of the
6006 // simdlen parameter must be less than or equal to the value of the safelen
6007 // parameter. Therefore, use safelen only in the absence of simdlen.
6008 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6009 LoopMDList.push_back(
6010 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6011 ConstantAsMetadata::get(VectorizeWidth)}));
6012 }
6013
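// With e.g. simdlen(8), no safelen and no if clause, the list built above
// yields loop metadata roughly of the form:
//   !{!self, !{!"llvm.loop.parallel_accesses", !AG},
//     !{!"llvm.loop.vectorize.enable", i1 true},
//     !{!"llvm.loop.vectorize.width", i32 8}}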
6014 addLoopMetadata(CanonicalLoop, LoopMDList);
6015}
6016
6017/// Create the TargetMachine object to query the backend for optimization
6018/// preferences.
6019///
6020/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6021/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6022 /// needed for the LLVM pass pipeline. We use some default options to avoid
6023/// having to pass too many settings from the frontend that probably do not
6024/// matter.
6025///
6026/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6027/// method. If we are going to use TargetMachine for more purposes, especially
6028/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6029 /// might become worth requiring front-ends to pass on their TargetMachine,
6030 /// or at least cache it between methods. Note that while front-ends such as
6031 /// Clang have just a single main TargetMachine per translation unit,
6032 /// "target-cpu" and "target-features" that determine the TargetMachine are
6033 /// per-function and can be overridden using __attribute__((target("OPTIONS"))).
6034static std::unique_ptr<TargetMachine>
6035 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6036 Module *M = F->getParent();
6037
6038 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6039 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6040 const llvm::Triple &Triple = M->getTargetTriple();
6041
6042 std::string Error;
6043 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6044 if (!TheTarget)
6045 return {};
6046
6047 const TargetOptions Options;
6048 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6049 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6050 /*CodeModel=*/std::nullopt, OptLevel));
6051}
6052
6053/// Heuristically determine the best-performant unroll factor for \p CLI. This
6054/// depends on the target processor. We are re-using the same heuristics as the
6055/// LoopUnrollPass.
6056static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6057 Function *F = CLI->getFunction();
6058
6059 // Assume the user requests the most aggressive unrolling, even if the rest of
6060 // the code is optimized using a lower setting.
6061 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6062 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6063
6064 FunctionAnalysisManager FAM;
6065 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6066 FAM.registerPass([]() { return AssumptionAnalysis(); });
6067 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6068 FAM.registerPass([]() { return LoopAnalysis(); });
6069 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6070 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6071 TargetIRAnalysis TIRA;
6072 if (TM)
6073 TIRA = TargetIRAnalysis(
6074 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6075 FAM.registerPass([&]() { return TIRA; });
6076
6077 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6078 ScalarEvolutionAnalysis SEA;
6079 ScalarEvolution &&SE = SEA.run(*F, FAM);
6080 DominatorTreeAnalysis DTA;
6081 DominatorTree &&DT = DTA.run(*F, FAM);
6082 LoopAnalysis LIA;
6083 LoopInfo &&LI = LIA.run(*F, FAM);
6084 AssumptionAnalysis ACT;
6085 AssumptionCache &&AC = ACT.run(*F, FAM);
6086 OptimizationRemarkEmitter ORE{F};
6087
6088 Loop *L = LI.getLoopFor(CLI->getHeader());
6089 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6090
6091 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6092     L, SE, TTI,
6093 /*BlockFrequencyInfo=*/nullptr,
6094 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6095 /*UserThreshold=*/std::nullopt,
6096 /*UserCount=*/std::nullopt,
6097 /*UserAllowPartial=*/true,
6098 /*UserAllowRuntime=*/true,
6099 /*UserUpperBound=*/std::nullopt,
6100 /*UserFullUnrollMaxCount=*/std::nullopt);
6101
6102 UP.Force = true;
6103
6104 // Account for additional optimizations taking place before the LoopUnrollPass
6105 // would unroll the loop.
6106 UP.Threshold *= UnrollThresholdFactor;
6107 UP.PartialThreshold *= UnrollThresholdFactor;
6108
6109 // Use normal unroll factors even if the rest of the code is optimized for
6110 // size.
6111 UP.OptSizeThreshold = UP.Threshold;
6112 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6113
6114 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6115 << " Threshold=" << UP.Threshold << "\n"
6116 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6117 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6118 << " PartialOptSizeThreshold="
6119 << UP.PartialOptSizeThreshold << "\n");
6120
6121 // Disable peeling.
6122 TargetTransformInfo::PeelingPreferences PP =
6123     gatherPeelingPreferences(L, SE, TTI,
6124                              /*UserAllowPeeling=*/false,
6125 /*UserAllowProfileBasedPeeling=*/false,
6126 /*UnrollingSpecficValues=*/false);
6127
6128 SmallPtrSet<const Value *, 32> EphValues;
6129 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6130
6131 // Assume that reads and writes to stack variables can be eliminated by
6132 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6133 // size.
6134 for (BasicBlock *BB : L->blocks()) {
6135 for (Instruction &I : *BB) {
6136 Value *Ptr;
6137 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6138 Ptr = Load->getPointerOperand();
6139 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6140 Ptr = Store->getPointerOperand();
6141 } else
6142 continue;
6143
6144 Ptr = Ptr->stripPointerCasts();
6145
6146 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6147 if (Alloca->getParent() == &F->getEntryBlock())
6148 EphValues.insert(&I);
6149 }
6150 }
6151 }
6152
6153 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6154
6155 // Loop is not unrollable if the loop contains certain instructions.
6156 if (!UCE.canUnroll()) {
6157 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6158 return 1;
6159 }
6160
6161 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6162 << "\n");
6163
6164 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6165 // be able to use it.
6166 int TripCount = 0;
6167 int MaxTripCount = 0;
6168 bool MaxOrZero = false;
6169 unsigned TripMultiple = 0;
6170
6171 bool UseUpperBound = false;
6172 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6173 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6174 UseUpperBound);
6175 unsigned Factor = UP.Count;
6176 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6177
6178 // This function returns 1 to signal that the loop should not be unrolled.
6179 if (Factor == 0)
6180 return 1;
6181 return Factor;
6182}
6183
6184void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6185 int32_t Factor,
6186 CanonicalLoopInfo **UnrolledCLI) {
6187 assert(Factor >= 0 && "Unroll factor must not be negative");
6188
6189 Function *F = Loop->getFunction();
6190 LLVMContext &Ctx = F->getContext();
6191
6192 // If the unrolled loop is not used for another loop-associated directive, it
6193 // is sufficient to add metadata for the LoopUnrollPass.
6194 if (!UnrolledCLI) {
6195 SmallVector<Metadata *, 2> LoopMetadata;
6196 LoopMetadata.push_back(
6197 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6198
6199 if (Factor >= 1) {
6200     ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6201         ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6202 LoopMetadata.push_back(MDNode::get(
6203 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6204 }
6205
6206 addLoopMetadata(Loop, LoopMetadata);
6207 return;
6208 }
6209
6210 // Heuristically determine the unroll factor.
6211 if (Factor == 0)
6212   Factor = computeHeuristicUnrollFactor(Loop);
6213
6214 // No change required with unroll factor 1.
6215 if (Factor == 1) {
6216 *UnrolledCLI = Loop;
6217 return;
6218 }
6219
6220 assert(Factor >= 2 &&
6221 "unrolling only makes sense with a factor of 2 or larger");
6222
6223 Type *IndVarTy = Loop->getIndVarType();
6224
6225 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6226 // unroll the inner loop.
6227 Value *FactorVal =
6228 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6229 /*isSigned=*/false));
6230 std::vector<CanonicalLoopInfo *> LoopNest =
6231 tileLoops(DL, {Loop}, {FactorVal});
6232 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6233 *UnrolledCLI = LoopNest[0];
6234 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6235
6236 // LoopUnrollPass can only fully unroll loops with constant trip count.
6237 // Unroll by the unroll factor with a fallback epilog for the remainder
6238 // iterations if necessary.
6239 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6240     ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6241 addLoopMetadata(
6242     InnerLoop,
6243     {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6244      MDNode::get(
6245          Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6246
6247#ifndef NDEBUG
6248 (*UnrolledCLI)->assertOK();
6249#endif
6250}
6251
6252OpenMPIRBuilder::InsertPointTy
6253OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6254 llvm::Value *BufSize, llvm::Value *CpyBuf,
6255 llvm::Value *CpyFn, llvm::Value *DidIt) {
6256 if (!updateToLocation(Loc))
6257 return Loc.IP;
6258
6259 uint32_t SrcLocStrSize;
6260 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6261 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6262 Value *ThreadId = getOrCreateThreadID(Ident);
6263
6264 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6265
6266 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6267
6268 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6269 Builder.CreateCall(Fn, Args);
6270
6271 return Builder.saveIP();
6272}
6273
6274OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6275 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6276 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6277     ArrayRef<llvm::Function *> CPFuncs) {
6278
6279 if (!updateToLocation(Loc))
6280 return Loc.IP;
6281
6282 // If needed, allocate and initialize `DidIt` with 0.
6283 // DidIt: flag variable: 1=single thread; 0=not single thread.
6284 llvm::Value *DidIt = nullptr;
6285 if (!CPVars.empty()) {
6286 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6287 Builder.CreateStore(Builder.getInt32(0), DidIt);
6288 }
6289
6290 Directive OMPD = Directive::OMPD_single;
6291 uint32_t SrcLocStrSize;
6292 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6293 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6294 Value *ThreadId = getOrCreateThreadID(Ident);
6295 Value *Args[] = {Ident, ThreadId};
6296
6297 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6298 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6299
6300 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6301 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6302
6303 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6304 if (Error Err = FiniCB(IP))
6305 return Err;
6306
6307 // The thread that executes the single region must set `DidIt` to 1.
6308 // This is used by __kmpc_copyprivate, to know if the caller is the
6309 // single thread or not.
6310 if (DidIt)
6311 Builder.CreateStore(Builder.getInt32(1), DidIt);
6312
6313 return Error::success();
6314 };
6315
6316 // generates the following:
6317 // if (__kmpc_single()) {
6318 // .... single region ...
6319 // __kmpc_end_single
6320 // }
6321 // __kmpc_copyprivate
6322 // __kmpc_barrier
6323
6324 InsertPointOrErrorTy AfterIP =
6325 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6326 /*Conditional*/ true,
6327 /*hasFinalize*/ true);
6328 if (!AfterIP)
6329 return AfterIP.takeError();
6330
6331 if (DidIt) {
6332 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6333 // NOTE BufSize is currently unused, so just pass 0.
6334 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6335 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6336 CPFuncs[I], DidIt);
6337 // NOTE __kmpc_copyprivate already inserts a barrier
6338 } else if (!IsNowait) {
6339 InsertPointOrErrorTy AfterIP =
6340 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6341 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6342 /* CheckCancelFlag */ false);
6343 if (!AfterIP)
6344 return AfterIP.takeError();
6345 }
6346 return Builder.saveIP();
6347}
6348
6349OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6350 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6351 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6352
6353 if (!updateToLocation(Loc))
6354 return Loc.IP;
6355
6356 Directive OMPD = Directive::OMPD_critical;
6357 uint32_t SrcLocStrSize;
6358 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6359 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6360 Value *ThreadId = getOrCreateThreadID(Ident);
6361 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6362 Value *Args[] = {Ident, ThreadId, LockVar};
6363
6364 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6365 Function *RTFn = nullptr;
6366 if (HintInst) {
6367 // Add Hint to entry Args and create call
6368 EnterArgs.push_back(HintInst);
6369 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6370 } else {
6371 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6372 }
6373 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6374
6375 Function *ExitRTLFn =
6376 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6377 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6378
6379 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6380 /*Conditional*/ false, /*hasFinalize*/ true);
6381}
6382
6383OpenMPIRBuilder::InsertPointTy
6384OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6385 InsertPointTy AllocaIP, unsigned NumLoops,
6386 ArrayRef<llvm::Value *> StoreValues,
6387 const Twine &Name, bool IsDependSource) {
6388 assert(
6389 llvm::all_of(StoreValues,
6390 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6391 "OpenMP runtime requires depend vec with i64 type");
6392
6393 if (!updateToLocation(Loc))
6394 return Loc.IP;
6395
6396 // Allocate space for vector and generate alloc instruction.
6397 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6398 Builder.restoreIP(AllocaIP);
6399 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6400 ArgsBase->setAlignment(Align(8));
6401 updateToLocation(Loc);
6402
6403 // Store the index value with offset in depend vector.
6404 for (unsigned I = 0; I < NumLoops; ++I) {
6405 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6406 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6407 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6408 STInst->setAlignment(Align(8));
6409 }
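// For '#pragma omp ordered depend(source)' in a two-level doacross nest this
// stores both iteration numbers into the [2 x i64] vector; the runtime call
// below is then __kmpc_doacross_post (source) or __kmpc_doacross_wait (sink).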
6410
6411 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6412 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6413
6414 uint32_t SrcLocStrSize;
6415 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6416 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6417 Value *ThreadId = getOrCreateThreadID(Ident);
6418 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6419
6420 Function *RTLFn = nullptr;
6421 if (IsDependSource)
6422 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6423 else
6424 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6425 Builder.CreateCall(RTLFn, Args);
6426
6427 return Builder.saveIP();
6428}
6429
6430OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6431 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6432 FinalizeCallbackTy FiniCB, bool IsThreads) {
6433 if (!updateToLocation(Loc))
6434 return Loc.IP;
6435
6436 Directive OMPD = Directive::OMPD_ordered;
6437 Instruction *EntryCall = nullptr;
6438 Instruction *ExitCall = nullptr;
6439
6440 if (IsThreads) {
6441 uint32_t SrcLocStrSize;
6442 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6443 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6444 Value *ThreadId = getOrCreateThreadID(Ident);
6445 Value *Args[] = {Ident, ThreadId};
6446
6447 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6448 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6449
6450 Function *ExitRTLFn =
6451 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6452 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6453 }
6454
6455 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6456 /*Conditional*/ false, /*hasFinalize*/ true);
6457}
6458
6459OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6460 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6461 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6462 bool HasFinalize, bool IsCancellable) {
6463
6464 if (HasFinalize)
6465 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6466
6467 // Create inlined region's entry and body blocks, in preparation
6468 // for conditional creation
6469 BasicBlock *EntryBB = Builder.GetInsertBlock();
6470 Instruction *SplitPos = EntryBB->getTerminator();
6471 if (!isa_and_nonnull<BranchInst>(SplitPos))
6472 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6473 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6474 BasicBlock *FiniBB =
6475 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6476
6477 Builder.SetInsertPoint(EntryBB->getTerminator());
6478 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6479
6480 // generate body
6481 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6482 /* CodeGenIP */ Builder.saveIP()))
6483 return Err;
6484
6485 // emit exit call and do any needed finalization.
6486 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6487 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6488 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6489 "Unexpected control flow graph state!!");
6490 InsertPointOrErrorTy AfterIP =
6491 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6492 if (!AfterIP)
6493 return AfterIP.takeError();
6494 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6495 "Unexpected Control Flow State!");
6496 MergeBlockIntoPredecessor(FiniBB);
6497
6498 // If we are skipping the region of a non-conditional, remove the exit
6499 // block, and clear the builder's insertion point.
6500 assert(SplitPos->getParent() == ExitBB &&
6501 "Unexpected Insertion point location!");
6502 auto merged = MergeBlockIntoPredecessor(ExitBB);
6503 BasicBlock *ExitPredBB = SplitPos->getParent();
6504 auto InsertBB = merged ? ExitPredBB : ExitBB;
6505 if (!isa_and_nonnull<BranchInst>(SplitPos))
6506 SplitPos->eraseFromParent();
6507 Builder.SetInsertPoint(InsertBB);
6508
6509 return Builder.saveIP();
6510}
6511
6512OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6513 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6514 // If there is nothing to do, return the current insertion point.
6515 if (!Conditional || !EntryCall)
6516 return Builder.saveIP();
6517
6518 BasicBlock *EntryBB = Builder.GetInsertBlock();
6519 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6520 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6521 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6522
6523 // Emit thenBB and set the Builder's insertion point there for
6524 // body generation next. Place the block after the current block.
6525 Function *CurFn = EntryBB->getParent();
6526 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6527
6528 // Move Entry branch to end of ThenBB, and replace with conditional
6529 // branch (If-stmt)
6530 Instruction *EntryBBTI = EntryBB->getTerminator();
6531 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6532 EntryBBTI->removeFromParent();
6533 Builder.SetInsertPoint(UI);
6534 Builder.Insert(EntryBBTI);
6535 UI->eraseFromParent();
6536 Builder.SetInsertPoint(ThenBB->getTerminator());
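// For a conditional entry the generated check is effectively:
//   %not_null = icmp ne <entry call result>, 0
//   br i1 %not_null, label %omp_region.body, label %omp_region.end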
6537
6538 // return an insertion point to ExitBB.
6539 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6540}
6541
6542OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6543 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6544 bool HasFinalize) {
6545
6546 Builder.restoreIP(FinIP);
6547
6548 // If there is finalization to do, emit it before the exit call
6549 if (HasFinalize) {
6550 assert(!FinalizationStack.empty() &&
6551 "Unexpected finalization stack state!");
6552
6553 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6554 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6555
6556 if (Error Err = Fi.FiniCB(FinIP))
6557 return Err;
6558
6559 BasicBlock *FiniBB = FinIP.getBlock();
6560 Instruction *FiniBBTI = FiniBB->getTerminator();
6561
6562 // set Builder IP for call creation
6563 Builder.SetInsertPoint(FiniBBTI);
6564 }
6565
6566 if (!ExitCall)
6567 return Builder.saveIP();
6568
6569 // Place the exit call as the last instruction before the finalization block terminator.
6570 ExitCall->removeFromParent();
6571 Builder.Insert(ExitCall);
6572
6573 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6574 ExitCall->getIterator());
6575}
6576
6577OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6578 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6579 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6580 if (!IP.isSet())
6581 return IP;
6582
6583 IRBuilder<>::InsertPointGuard IPG(Builder);
6584
6585 // Creates the following CFG structure:
6586 // OMP_Entry : (MasterAddr != PrivateAddr)?
6587 // F T
6588 // | \
6589 //   |  copyin.not.master
6590 // | /
6591 // v /
6592 // copyin.not.master.end
6593 // |
6594 // v
6595 // OMP.Entry.Next
6596
6597 BasicBlock *OMP_Entry = IP.getBlock();
6598 Function *CurFn = OMP_Entry->getParent();
6599 BasicBlock *CopyBegin =
6600 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6601 BasicBlock *CopyEnd = nullptr;
6602
6603 // If the entry block is terminated, split to preserve the branch to the
6604 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6605 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6606 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6607 "copyin.not.master.end");
6608 OMP_Entry->getTerminator()->eraseFromParent();
6609 } else {
6610 CopyEnd =
6611 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6612 }
6613
6614 Builder.SetInsertPoint(OMP_Entry);
6615 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6616 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6617 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6618 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6619
6620 Builder.SetInsertPoint(CopyBegin);
6621 if (BranchtoEnd)
6622 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6623
6624 return Builder.saveIP();
6625}
6626
6627CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6628                                          Value *Size, Value *Allocator,
6629                                          std::string Name) {
6630 IRBuilder<>::InsertPointGuard IPG(Builder);
6631 updateToLocation(Loc);
6632
6633 uint32_t SrcLocStrSize;
6634 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6635 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6636 Value *ThreadId = getOrCreateThreadID(Ident);
6637 Value *Args[] = {ThreadId, Size, Allocator};
6638
6639 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6640
6641 return Builder.CreateCall(Fn, Args, Name);
6642}
6643
6644CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6645 Value *Addr, Value *Allocator,
6646 std::string Name) {
6647 IRBuilder<>::InsertPointGuard IPG(Builder);
6648 updateToLocation(Loc);
6649
6650 uint32_t SrcLocStrSize;
6651 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6652 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6653 Value *ThreadId = getOrCreateThreadID(Ident);
6654 Value *Args[] = {ThreadId, Addr, Allocator};
6655 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6656 return Builder.CreateCall(Fn, Args, Name);
6657}
6658
6659CallInst *OpenMPIRBuilder::createOMPInteropInit(
6660 const LocationDescription &Loc, Value *InteropVar,
6661 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6662 Value *DependenceAddress, bool HaveNowaitClause) {
6663 IRBuilder<>::InsertPointGuard IPG(Builder);
6664 updateToLocation(Loc);
6665
6666 uint32_t SrcLocStrSize;
6667 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6668 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6669 Value *ThreadId = getOrCreateThreadID(Ident);
6670 if (Device == nullptr)
6671   Device = ConstantInt::get(Int32, -1);
6672 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6673 if (NumDependences == nullptr) {
6674 NumDependences = ConstantInt::get(Int32, 0);
6675 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6676 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6677 }
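// Without explicit clauses the call thus uses device ID -1 (treated by the
// runtime as the default device), zero dependences, and a null dependence list.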
6678 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6679 Value *Args[] = {
6680 Ident, ThreadId, InteropVar, InteropTypeVal,
6681 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6682
6683 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6684
6685 return Builder.CreateCall(Fn, Args);
6686}
6687
6688CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6689 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6690 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6691 IRBuilder<>::InsertPointGuard IPG(Builder);
6692 updateToLocation(Loc);
6693
6694 uint32_t SrcLocStrSize;
6695 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6696 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6697 Value *ThreadId = getOrCreateThreadID(Ident);
6698 if (Device == nullptr)
6699   Device = ConstantInt::get(Int32, -1);
6700 if (NumDependences == nullptr) {
6701 NumDependences = ConstantInt::get(Int32, 0);
6702 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6703 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6704 }
6705 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6706 Value *Args[] = {
6707 Ident, ThreadId, InteropVar, Device,
6708 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6709
6710 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6711
6712 return Builder.CreateCall(Fn, Args);
6713}
6714
6715CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6716 Value *InteropVar, Value *Device,
6717 Value *NumDependences,
6718 Value *DependenceAddress,
6719 bool HaveNowaitClause) {
6720 IRBuilder<>::InsertPointGuard IPG(Builder);
6721 updateToLocation(Loc);
6722 uint32_t SrcLocStrSize;
6723 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6724 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6725 Value *ThreadId = getOrCreateThreadID(Ident);
6726 if (Device == nullptr)
6727   Device = ConstantInt::get(Int32, -1);
6728 if (NumDependences == nullptr) {
6729 NumDependences = ConstantInt::get(Int32, 0);
6730 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6731 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6732 }
6733 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6734 Value *Args[] = {
6735 Ident, ThreadId, InteropVar, Device,
6736 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6737
6738 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6739
6740 return Builder.CreateCall(Fn, Args);
6741}
6742
6743CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6744 const LocationDescription &Loc, llvm::Value *Pointer,
6745 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6746 IRBuilder<>::InsertPointGuard IPG(Builder);
6747 updateToLocation(Loc);
6748
6749 uint32_t SrcLocStrSize;
6750 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6751 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6752 Value *ThreadId = getOrCreateThreadID(Ident);
6753 Constant *ThreadPrivateCache =
6754 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6755 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6756
6757 Function *Fn =
6758 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6759
6760 return Builder.CreateCall(Fn, Args);
6761}
6762
6763OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6764 const LocationDescription &Loc,
6765 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6766 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6767 "expected num_threads and num_teams to be specified");
6768
6769 if (!updateToLocation(Loc))
6770 return Loc.IP;
6771
6772 uint32_t SrcLocStrSize;
6773 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6774 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6775 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6776 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6777 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6778 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6779 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6780
6781 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6782 Function *Kernel = DebugKernelWrapper;
6783
6784 // We need to strip the debug prefix to get the correct kernel name.
6785 StringRef KernelName = Kernel->getName();
6786 const std::string DebugPrefix = "_debug__";
6787 if (KernelName.ends_with(DebugPrefix)) {
6788 KernelName = KernelName.drop_back(DebugPrefix.length());
6789 Kernel = M.getFunction(KernelName);
6790 assert(Kernel && "Expected the real kernel to exist");
6791 }
6792
6793 // Manifest the launch configuration in the metadata matching the kernel
6794 // environment.
6795 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6796 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6797
6798 // If MaxThreads is not set, select the maximum of the default workgroup
6799 // size and the MinThreads value.
6800 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6801 if (MaxThreadsVal < 0)
6802 MaxThreadsVal = std::max(
6803 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6804
6805 if (MaxThreadsVal > 0)
6806 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6807
6808 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6809 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6810 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6811 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6812 Constant *ReductionDataSize =
6813 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6814 Constant *ReductionBufferLength =
6815 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6816
6817 Function *Fn = getOrCreateRuntimeFunctionPtr(
6818 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6819 const DataLayout &DL = Fn->getDataLayout();
6820
6821 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6822 Constant *DynamicEnvironmentInitializer =
6823 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6824 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6825 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6826 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6827 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6828 DL.getDefaultGlobalsAddressSpace());
6829 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6830
6831 Constant *DynamicEnvironment =
6832 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6833 ? DynamicEnvironmentGV
6834 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6835 DynamicEnvironmentPtr);
6836
6837 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6838 ConfigurationEnvironment, {
6839 UseGenericStateMachineVal,
6840 MayUseNestedParallelismVal,
6841 IsSPMDVal,
6842 MinThreads,
6843 MaxThreads,
6844 MinTeams,
6845 MaxTeams,
6846 ReductionDataSize,
6847 ReductionBufferLength,
6848 });
6849 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6850 KernelEnvironment, {
6851 ConfigurationEnvironmentInitializer,
6852 Ident,
6853 DynamicEnvironment,
6854 });
6855 std::string KernelEnvironmentName =
6856 (KernelName + "_kernel_environment").str();
6857 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6858 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6859 KernelEnvironmentInitializer, KernelEnvironmentName,
6860 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6861 DL.getDefaultGlobalsAddressSpace());
6862 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6863
6864 Constant *KernelEnvironment =
6865 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6866 ? KernelEnvironmentGV
6867 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6868 KernelEnvironmentPtr);
6869 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6870 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6871 KernelLaunchEnvironment =
6872 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6873 ? KernelLaunchEnvironment
6874 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6875 KernelLaunchEnvParamTy);
6876 CallInst *ThreadKind =
6877 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6878
6879 Value *ExecUserCode = Builder.CreateICmpEQ(
6880 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6881 "exec_user_code");
6882
6883 // ThreadKind = __kmpc_target_init(...)
6884 // if (ThreadKind == -1)
6885 // user_code
6886 // else
6887 // return;
6888
6889 auto *UI = Builder.CreateUnreachable();
6890 BasicBlock *CheckBB = UI->getParent();
6891 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6892
6893 BasicBlock *WorkerExitBB = BasicBlock::Create(
6894 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6895 Builder.SetInsertPoint(WorkerExitBB);
6896 Builder.CreateRetVoid();
6897
6898 auto *CheckBBTI = CheckBB->getTerminator();
6899 Builder.SetInsertPoint(CheckBBTI);
6900 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6901
6902 CheckBBTI->eraseFromParent();
6903 UI->eraseFromParent();
6904
6905 // Continue in the "user_code" block, see diagram above and in
6906 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6907 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6908}
6909
6910void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6911 int32_t TeamsReductionDataSize,
6912 int32_t TeamsReductionBufferLength) {
6913 if (!updateToLocation(Loc))
6914 return;
6915
6916 Function *Fn = getOrCreateRuntimeFunctionPtr(
6917 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6918
6919 Builder.CreateCall(Fn, {});
6920
6921 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6922 return;
6923
6924 Function *Kernel = Builder.GetInsertBlock()->getParent();
6925 // We need to strip the debug prefix to get the correct kernel name.
6926 StringRef KernelName = Kernel->getName();
6927 const std::string DebugPrefix = "_debug__";
6928 if (KernelName.ends_with(DebugPrefix))
6929 KernelName = KernelName.drop_back(DebugPrefix.length());
6930 auto *KernelEnvironmentGV =
6931 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6932 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6933 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6934 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6935 KernelEnvironmentInitializer,
6936 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6937 NewInitializer = ConstantFoldInsertValueInstruction(
6938 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6939 {0, 8});
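// The indices {0, 7} and {0, 8} address the ReductionDataSize and
// ReductionBufferLength fields of the ConfigurationEnvironment, which
// createTargetInit stored at field 0 of the kernel environment struct.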
6940 KernelEnvironmentGV->setInitializer(NewInitializer);
6941}
6942
6943static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6944 bool Min) {
6945 if (Kernel.hasFnAttribute(Name)) {
6946 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6947 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6948 }
6949 Kernel.addFnAttr(Name, llvm::utostr(Value));
6950}
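// Thus updateNVPTXAttr(Kernel, "nvvm.maxntid", 128, /*Min=*/true) tightens an
// existing "nvvm.maxntid"="256" to "128", while an existing "64" is kept
// because the minimum wins.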
6951
6952std::pair<int32_t, int32_t>
6953OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6954 int32_t ThreadLimit =
6955 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6956
6957 if (T.isAMDGPU()) {
6958 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6959 if (!Attr.isValid() || !Attr.isStringAttribute())
6960 return {0, ThreadLimit};
6961 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6962 int32_t LB, UB;
6963 if (!llvm::to_integer(UBStr, UB, 10))
6964 return {0, ThreadLimit};
6965 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6966 if (!llvm::to_integer(LBStr, LB, 10))
6967 return {0, UB};
6968 return {LB, UB};
6969 }
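// E.g. "amdgpu-flat-work-group-size"="1,256" combined with an
// omp_target_thread_limit of 128 yields {1, 128}: the tighter of the two
// upper bounds wins.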
6970
6971 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6972 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6973 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6974 }
6975 return {0, ThreadLimit};
6976}
6977
6978void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6979 Function &Kernel, int32_t LB,
6980 int32_t UB) {
6981 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6982
6983 if (T.isAMDGPU()) {
6984 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6985 llvm::utostr(LB) + "," + llvm::utostr(UB));
6986 return;
6987 }
6988
6989 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6990}
6991
6992std::pair<int32_t, int32_t>
6993OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6994 // TODO: Read from backend annotations if available.
6995 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6996}
6997
6998void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
6999 int32_t LB, int32_t UB) {
7000 if (T.isNVPTX())
7001 if (UB > 0)
7002 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7003 if (T.isAMDGPU())
7004 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7005
7006 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7007}
7008
7009void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7010 Function *OutlinedFn) {
7011 if (Config.isTargetDevice()) {
7012   OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7013   // TODO: Determine if DSO local can be set to true.
7014   OutlinedFn->setDSOLocal(false);
7015   OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7016   if (T.isAMDGCN())
7017     OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7018   else if (T.isNVPTX())
7019     OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7020   else if (T.isSPIRV())
7021     OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7022 }
7023}
7024
7025Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7026 StringRef EntryFnIDName) {
7027 if (Config.isTargetDevice()) {
7028 assert(OutlinedFn && "The outlined function must exist if embedded");
7029 return OutlinedFn;
7030 }
7031
7032 return new GlobalVariable(
7033 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7034 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7035}
7036
7037Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7038 StringRef EntryFnName) {
7039 if (OutlinedFn)
7040 return OutlinedFn;
7041
7042 assert(!M.getGlobalVariable(EntryFnName, true) &&
7043 "Named kernel already exists?");
7044 return new GlobalVariable(
7045 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7046 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7047}
7048
7049Error OpenMPIRBuilder::emitTargetRegionFunction(
7050 TargetRegionEntryInfo &EntryInfo,
7051 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7052 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7053
7054 SmallString<64> EntryFnName;
7055 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7056
7057 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7058 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7059 if (!CBResult)
7060 return CBResult.takeError();
7061 OutlinedFn = *CBResult;
7062 } else {
7063 OutlinedFn = nullptr;
7064 }
7065
7066 // If this target outline function is not an offload entry, we don't need to
7067 // register it. This may be the case for a false if clause, or if there are
7068 // no OpenMP targets.
7069 if (!IsOffloadEntry)
7070 return Error::success();
7071
7072 std::string EntryFnIDName =
7073 Config.isTargetDevice()
7074 ? std::string(EntryFnName)
7075 : createPlatformSpecificName({EntryFnName, "region_id"});
7076
7077 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7078 EntryFnName, EntryFnIDName);
7079 return Error::success();
7080}
7081
7082Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7083 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7084 StringRef EntryFnName, StringRef EntryFnIDName) {
7085 if (OutlinedFn)
7086 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7087 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7088 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7089 OffloadInfoManager.registerTargetRegionEntryInfo(
7090 EntryInfo, EntryAddr, OutlinedFnID,
7091 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7092 return OutlinedFnID;
7093}
7094
7095OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7096 const LocationDescription &Loc, InsertPointTy AllocaIP,
7097 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7098 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7099 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7100 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7101 BodyGenTy BodyGenType)>
7102 BodyGenCB,
7103 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7104 if (!updateToLocation(Loc))
7105 return InsertPointTy();
7106
7107 Builder.restoreIP(CodeGenIP);
7108 // Disable TargetData CodeGen on Device pass.
7109 if (Config.IsTargetDevice.value_or(false)) {
7110 if (BodyGenCB) {
7111 InsertPointOrErrorTy AfterIP =
7112 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7113 if (!AfterIP)
7114 return AfterIP.takeError();
7115 Builder.restoreIP(*AfterIP);
7116 }
7117 return Builder.saveIP();
7118 }
7119
7120 bool IsStandAlone = !BodyGenCB;
7121 MapInfosTy *MapInfo;
7122 // Generate the code for the opening of the data environment. Capture all the
7123 // arguments of the runtime call by reference because they are used in the
7124 // closing of the region.
7125 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7126 InsertPointTy CodeGenIP) -> Error {
7127 MapInfo = &GenMapInfoCB(Builder.saveIP());
7128 if (Error Err = emitOffloadingArrays(
7129 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7130 /*IsNonContiguous=*/true, DeviceAddrCB))
7131 return Err;
7132
7133 TargetDataRTArgs RTArgs;
7134 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7135
7136 // Emit the number of elements in the offloading arrays.
7137 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7138
7139 // Source location for the ident struct
7140 if (!SrcLocInfo) {
7141 uint32_t SrcLocStrSize;
7142 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7143 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7144 }
7145
7146 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7147 SrcLocInfo, DeviceID,
7148 PointerNum, RTArgs.BasePointersArray,
7149 RTArgs.PointersArray, RTArgs.SizesArray,
7150 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7151 RTArgs.MappersArray};
7152
7153 if (IsStandAlone) {
7154 assert(MapperFunc && "MapperFunc missing for standalone target data");
7155
7156 auto TaskBodyCB = [&](Value *, Value *,
7157 IRBuilderBase::InsertPoint) -> Error {
7158 if (Info.HasNoWait) {
7159 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7160 llvm::Constant::getNullValue(VoidPtr),
7161 llvm::Constant::getNullValue(Int32),
7162 llvm::Constant::getNullValue(VoidPtr)});
7163 }
7164
7165 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7166 OffloadingArgs);
7167
7168 if (Info.HasNoWait) {
7169 BasicBlock *OffloadContBlock =
7170 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7171 Function *CurFn = Builder.GetInsertBlock()->getParent();
7172 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7173 Builder.restoreIP(Builder.saveIP());
7174 }
7175 return Error::success();
7176 };
7177
7178 bool RequiresOuterTargetTask = Info.HasNoWait;
7179 if (!RequiresOuterTargetTask)
7180 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7181 /*TargetTaskAllocaIP=*/{}));
7182 else
7183 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7184 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7185 } else {
7186 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7187 omp::OMPRTL___tgt_target_data_begin_mapper);
7188
7189 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7190
7191 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7192 if (isa<AllocaInst>(DeviceMap.second.second)) {
7193 auto *LI =
7194 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7195 Builder.CreateStore(LI, DeviceMap.second.second);
7196 }
7197 }
7198
7199 // If device pointer privatization is required, emit the body of the
7200 // region here. It will have to be duplicated: with and without
7201 // privatization.
7202 InsertPointOrErrorTy AfterIP =
7203 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7204 if (!AfterIP)
7205 return AfterIP.takeError();
7206 Builder.restoreIP(*AfterIP);
7207 }
7208 return Error::success();
7209 };
7210
7211 // If we need device pointer privatization, we need to emit the body of the
7212 // region with no privatization in the 'else' branch of the conditional.
7213 // Otherwise, we don't have to do anything.
7214 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7215 InsertPointTy CodeGenIP) -> Error {
7216 InsertPointOrErrorTy AfterIP =
7217 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7218 if (!AfterIP)
7219 return AfterIP.takeError();
7220 Builder.restoreIP(*AfterIP);
7221 return Error::success();
7222 };
7223
7224 // Generate code for the closing of the data region.
7225 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7226 TargetDataRTArgs RTArgs;
7227 Info.EmitDebug = !MapInfo->Names.empty();
7228 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7229
7230 // Emit the number of elements in the offloading arrays.
7231 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7232
7233 // Source location for the ident struct
7234 if (!SrcLocInfo) {
7235 uint32_t SrcLocStrSize;
7236 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7237 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7238 }
7239
7240 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7241 PointerNum, RTArgs.BasePointersArray,
7242 RTArgs.PointersArray, RTArgs.SizesArray,
7243 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7244 RTArgs.MappersArray};
7245 Function *EndMapperFunc =
7246 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7247
7248 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7249 return Error::success();
7250 };
7251
7252 // We don't have to do anything to close the region if the if clause evaluates
7253 // to false.
7254 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7255 return Error::success();
7256 };
7257
7258 Error Err = [&]() -> Error {
7259 if (BodyGenCB) {
7260 Error Err = [&]() {
7261 if (IfCond)
7262 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7263 return BeginThenGen(AllocaIP, Builder.saveIP());
7264 }();
7265
7266 if (Err)
7267 return Err;
7268
7269 // If we don't require privatization of device pointers, we emit the body
7270 // in between the runtime calls. This avoids duplicating the body code.
7271 InsertPointOrErrorTy AfterIP =
7272 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7273 if (!AfterIP)
7274 return AfterIP.takeError();
7275 restoreIPandDebugLoc(Builder, *AfterIP);
7276
7277 if (IfCond)
7278 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7279 return EndThenGen(AllocaIP, Builder.saveIP());
7280 }
7281 if (IfCond)
7282 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7283 return BeginThenGen(AllocaIP, Builder.saveIP());
7284 }();
7285
7286 if (Err)
7287 return Err;
7288
7289 return Builder.saveIP();
7290}
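
// A sketch of the host IR shape produced above for a non-standalone
// `target data` region without an `if` clause (names illustrative):
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 %device_id,
//       i32 %num_ptrs, ptr %baseptrs, ptr %ptrs, ptr %sizes, ptr %maptypes,
//       ptr %mapnames, ptr %mappers)
//   ; ... user body emitted via BodyGenCB (BodyGenTy::NoPriv) ...
//   call void @__tgt_target_data_end_mapper(<same argument list>)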
7291
7292 FunctionCallee
7293 OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7294 bool IsGPUDistribute) {
7295 assert((IVSize == 32 || IVSize == 64) &&
7296 "IV size is not compatible with the omp runtime");
7297 RuntimeFunction Name;
7298 if (IsGPUDistribute)
7299 Name = IVSize == 32
7300 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7301 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7302 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7303 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7304 else
7305 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7306 : omp::OMPRTL___kmpc_for_static_init_4u)
7307 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7308 : omp::OMPRTL___kmpc_for_static_init_8u);
7309
7310 return getOrCreateRuntimeFunction(M, Name);
7311}
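
// For example, IVSize == 32 with IVSigned == true selects
// __kmpc_for_static_init_4, while IVSize == 64, IVSigned == false and
// IsGPUDistribute == true selects __kmpc_distribute_static_init_8u.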
7312
7313FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7314 bool IVSigned) {
7315 assert((IVSize == 32 || IVSize == 64) &&
7316 "IV size is not compatible with the omp runtime");
7317 RuntimeFunction Name = IVSize == 32
7318 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7319 : omp::OMPRTL___kmpc_dispatch_init_4u)
7320 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7321 : omp::OMPRTL___kmpc_dispatch_init_8u);
7322
7323 return getOrCreateRuntimeFunction(M, Name);
7324}
7325
7326FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7327 bool IVSigned) {
7328 assert((IVSize == 32 || IVSize == 64) &&
7329 "IV size is not compatible with the omp runtime");
7330 RuntimeFunction Name = IVSize == 32
7331 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7332 : omp::OMPRTL___kmpc_dispatch_next_4u)
7333 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7334 : omp::OMPRTL___kmpc_dispatch_next_8u);
7335
7336 return getOrCreateRuntimeFunction(M, Name);
7337}
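
// E.g. an unsigned 64-bit IV selects __kmpc_dispatch_next_8u; in the OpenMP
// runtime, a nonzero return from the dispatch_next family means another
// chunk was assigned to the calling thread.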
7338
7339FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7340 bool IVSigned) {
7341 assert((IVSize == 32 || IVSize == 64) &&
7342 "IV size is not compatible with the omp runtime");
7343 RuntimeFunction Name = IVSize == 32
7344 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7345 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7346 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7347 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7348
7349 return getOrCreateRuntimeFunction(M, Name);
7350}
7351
7352FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7353 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7354}
7355
7356 static void FixupDebugInfoForOutlinedFunction(
7357 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7358 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7359
7360 DISubprogram *NewSP = Func->getSubprogram();
7361 if (!NewSP)
7362 return;
7363
7364 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7365
7366 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7367 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7368 // Only use cached variable if the arg number matches. This is important
7369 // so that DIVariable created for privatized variables are not discarded.
7370 if (NewVar && (arg == NewVar->getArg()))
7371 return NewVar;
7372
7373 NewVar = llvm::DILocalVariable::get(
7374 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7375 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7376 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7377 return NewVar;
7378 };
7379
7380 auto UpdateDebugRecord = [&](auto *DR) {
7381 DILocalVariable *OldVar = DR->getVariable();
7382 unsigned ArgNo = 0;
7383 for (auto Loc : DR->location_ops()) {
7384 auto Iter = ValueReplacementMap.find(Loc);
7385 if (Iter != ValueReplacementMap.end()) {
7386 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7387 ArgNo = std::get<1>(Iter->second) + 1;
7388 }
7389 }
7390 if (ArgNo != 0)
7391 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7392 };
7393
7394 // The location and scope of variable intrinsics and records still point to
7395 // the parent function of the target region. Update them.
7396 for (Instruction &I : instructions(Func)) {
7398 "Unexpected debug intrinsic");
7399 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7400 UpdateDebugRecord(&DVR);
7401 }
7402 // An extra argument is passed to the device. Create the debug data for it.
7403 if (OMPBuilder.Config.isTargetDevice()) {
7404 DICompileUnit *CU = NewSP->getUnit();
7405 Module *M = Func->getParent();
7406 DIBuilder DB(*M, true, CU);
7407 DIType *VoidPtrTy =
7408 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7409 DILocalVariable *Var = DB.createParameterVariable(
7410 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7411 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7412 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7413 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7414 &(*Func->begin()));
7415 }
7416}
7417
7418 static Value *removeASCastIfPresent(Value *V) {
7419 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7420 return cast<Operator>(V)->getOperand(0);
7421 return V;
7422}
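
// E.g. for `%p = addrspacecast ptr addrspace(1) @g to ptr`, this returns the
// underlying @g; any other value is returned unchanged.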
7423
7424 static Expected<Function *> createOutlinedFunction(
7425 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7426 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7427 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7428 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7429 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7430 SmallVector<Type *> ParameterTypes;
7431 if (OMPBuilder.Config.isTargetDevice()) {
7432 // Add the "implicit" runtime argument we use to provide launch specific
7433 // information for target devices.
7434 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7435 ParameterTypes.push_back(Int8PtrTy);
7436
7437 // All parameters to target devices are passed as pointers
7438 // or i64. This assumes 64-bit address spaces/pointers.
7439 for (auto &Arg : Inputs)
7440 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7441 ? Arg->getType()
7442 : Type::getInt64Ty(Builder.getContext()));
7443 } else {
7444 for (auto &Arg : Inputs)
7445 ParameterTypes.push_back(Arg->getType());
7446 }
7447
7448 auto BB = Builder.GetInsertBlock();
7449 auto M = BB->getModule();
7450 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7451 /*isVarArg*/ false);
7452 auto Func =
7453 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7454
7455 // Forward target-cpu and target-features function attributes from the
7456 // original function to the new outlined function.
7457 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7458
7459 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7460 if (TargetCpuAttr.isStringAttribute())
7461 Func->addFnAttr(TargetCpuAttr);
7462
7463 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7464 if (TargetFeaturesAttr.isStringAttribute())
7465 Func->addFnAttr(TargetFeaturesAttr);
7466
7467 if (OMPBuilder.Config.isTargetDevice()) {
7468 Value *ExecMode =
7469 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7470 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7471 }
7472
7473 // Save insert point.
7474 IRBuilder<>::InsertPointGuard IPG(Builder);
7475 // We will generate the entries in the outlined function but the debug
7476 // location may still be pointing to the parent function. Reset it now.
7477 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7478
7479 // Generate the region into the function.
7480 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7481 Builder.SetInsertPoint(EntryBB);
7482
7483 // Insert target init call in the device compilation pass.
7484 if (OMPBuilder.Config.isTargetDevice())
7485 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7486
7487 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7488
7489 // As we embed the user code in the middle of our target region after we
7490 // generate entry code, we must move what allocas we can into the entry
7491 // block to avoid possibly breaking optimisations for the device.
7492 if (OMPBuilder.Config.isTargetDevice())
7493 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7494
7495 // Insert target deinit call in the device compilation pass.
7496 BasicBlock *OutlinedBodyBB =
7497 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7498 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7499 Builder.saveIP(),
7500 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7501 if (!AfterIP)
7502 return AfterIP.takeError();
7503 Builder.restoreIP(*AfterIP);
7504 if (OMPBuilder.Config.isTargetDevice())
7505 OMPBuilder.createTargetDeinit(Builder);
7506
7507 // Insert return instruction.
7508 Builder.CreateRetVoid();
7509
7510 // New Alloca IP at entry point of created device function.
7511 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7512 auto AllocaIP = Builder.saveIP();
7513
7514 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7515
7516 // Skip the artificial dyn_ptr on the device.
7517 const auto &ArgRange =
7518 OMPBuilder.Config.isTargetDevice()
7519 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7520 : Func->args();
7521
7522 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7523
7524 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7525 // Things like GEP's can come in the form of Constants. Constants and
7526 // ConstantExpr's do not have access to the knowledge of what they're
7527 // contained in, so we must dig a little to find an instruction so we
7528 // can tell if they're used inside of the function we're outlining. We
7529 // also replace the original constant expression with a new instruction
7531 // equivalent, since an instruction allows easy modification in the
7532 // following loop: we now know the constant (instruction) is
7532 // owned by our target function and replaceUsesOfWith can now be invoked
7533 // on it (cannot do this with constants it seems). A brand new one also
7534 // allows us to be cautious as it is perhaps possible the old expression
7535 // was used inside of the function but exists and is used externally
7536 // (unlikely by the nature of a Constant, but still).
7537 // NOTE: We cannot remove dead constants that have been rewritten to
7538 // instructions at this stage, we run the risk of breaking later lowering
7539 // by doing so as we could still be in the process of lowering the module
7540 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7541 // constants we have created rewritten versions of.
7542 if (auto *Const = dyn_cast<Constant>(Input))
7543 convertUsersOfConstantsToInstructions(Const, Func, false);
7544
7545 // Collect users before iterating over them to avoid invalidating the
7546 // iteration in case a user uses Input more than once (e.g. a call
7547 // instruction).
7548 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7549 // Collect all the instructions
7550 for (User *User : make_early_inc_range(Users))
7551 if (auto *Instr = dyn_cast<Instruction>(User))
7552 if (Instr->getFunction() == Func)
7553 Instr->replaceUsesOfWith(Input, InputCopy);
7554 };
7555
7556 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7557
7558 // Rewrite uses of input values to parameters.
7559 for (auto InArg : zip(Inputs, ArgRange)) {
7560 Value *Input = std::get<0>(InArg);
7561 Argument &Arg = std::get<1>(InArg);
7562 Value *InputCopy = nullptr;
7563
7564 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7565 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7566 if (!AfterIP)
7567 return AfterIP.takeError();
7568 Builder.restoreIP(*AfterIP);
7569 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7570
7571 // In certain cases a Global may be set up for replacement; however, this
7572 // Global may be used in multiple arguments to the kernel, just segmented
7573 // apart. For example, if we have a global array that is sectioned into
7574 // multiple mappings (technically not legal in OpenMP, but there is a case
7575 // in Fortran for Common Blocks where this is necessary), we will end up
7576 // with GEPs into this array inside the kernel that refer to the Global
7577 // but are technically separate arguments to the kernel for all intents and
7578 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7579 // index, it will fold into a reference to the Global; if we then encounter
7580 // this folded GEP during replacement, all of the references to the
7581 // Global in the kernel will be replaced with the argument we have generated
7582 // that corresponds to it, including any other GEPs that refer to the
7583 // Global that may be other arguments. This will invalidate all of the
7584 // preceding mapped arguments that refer to the same global that may be
7585 // separate segments. To prevent this, we defer global processing until all
7586 // other processing has been performed.
7587
7588 if (llvm::isa<llvm::GlobalValue>(removeASCastIfPresent(Input))) {
7589 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7590 continue;
7591 }
7592
7593 if (isa<ConstantData>(Input))
7594 continue;
7595
7596 ReplaceValue(Input, InputCopy, Func);
7597 }
7598
7599 // Replace all of our deferred Input values, currently just Globals.
7600 for (auto Deferred : DeferredReplacement)
7601 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7602
7603 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7604 ValueReplacementMap);
7605 return Func;
7606}
7607/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7608/// of pointers containing shared data between the parent task and the created
7609/// task.
7610static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7611 IRBuilderBase &Builder,
7612 Value *TaskWithPrivates,
7613 Type *TaskWithPrivatesTy) {
7614
7615 Type *TaskTy = OMPIRBuilder.Task;
7616 LLVMContext &Ctx = Builder.getContext();
7617 Value *TaskT =
7618 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7619 Value *Shareds = TaskT;
7620 // TaskWithPrivatesTy can be one of the following
7621 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7622 // %struct.privates }
7623 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7624 //
7625 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7626 // its first member has to be the task descriptor. TaskTy is the type of the
7627 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7628 // first member of TaskT gives us the pointer to shared data.
7629 if (TaskWithPrivatesTy != TaskTy)
7630 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7631 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7632}
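
// Illustration of the IR emitted above when privates are present (type and
// value names hypothetical):
//   %task = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %gep = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task, i32 0, i32 0
//   %shareds = load ptr, ptr %gep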
7633 /// Create an entry point for a target task. It will have the following
7634 /// signature:
7635/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7636/// This function is called from emitTargetTask once the
7637/// code to launch the target kernel has been outlined already.
7638/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7639/// into the task structure so that the deferred target task can access this
7640/// data even after the stack frame of the generating task has been rolled
7641/// back. Offloading arrays contain base pointers, pointers, sizes etc
7642/// of the data that the target kernel will access. These in effect are the
7643/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7645 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7646 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7647 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7648
7649 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7650 // This is because PrivatesTy is the type of the structure in which
7651 // we pass the offloading arrays to the deferred target task.
7652 assert((!NumOffloadingArrays || PrivatesTy) &&
7653 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7654 "to privatize");
7655
7656 Module &M = OMPBuilder.M;
7657 // KernelLaunchFunction is the target launch function, i.e.
7658 // the function that sets up kernel arguments and calls
7659 // __tgt_target_kernel to launch the kernel on the device.
7660 //
7661 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7662
7663 // StaleCI is the CallInst which is the call to the outlined
7664 // target kernel launch function. If there are local live-in values
7665 // that the outlined function uses then these are aggregated into a structure
7666 // which is passed as the second argument. If there are no local live-in
7667 // values or if all values used by the outlined kernel are global variables,
7668 // then there's only one argument, the threadID. So, StaleCI can be
7669 //
7670 // %structArg = alloca { ptr, ptr }, align 8
7671 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7672 // store ptr %20, ptr %gep_, align 8
7673 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7674 // store ptr %21, ptr %gep_8, align 8
7675 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7676 //
7677 // OR
7678 //
7679 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7680 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7681 StaleCI->getIterator());
7682
7683 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7684
7685 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7686 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7687 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7688
7689 auto ProxyFnTy =
7690 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7691 /* isVarArg */ false);
7692 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7693 ".omp_target_task_proxy_func",
7694 Builder.GetInsertBlock()->getModule());
7695 Value *ThreadId = ProxyFn->getArg(0);
7696 Value *TaskWithPrivates = ProxyFn->getArg(1);
7697 ThreadId->setName("thread.id");
7698 TaskWithPrivates->setName("task");
7699
7700 bool HasShareds = SharedArgsOperandNo > 0;
7701 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7702 BasicBlock *EntryBB =
7703 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7704 Builder.SetInsertPoint(EntryBB);
7705
7706 SmallVector<Value *> KernelLaunchArgs;
7707 KernelLaunchArgs.reserve(StaleCI->arg_size());
7708 KernelLaunchArgs.push_back(ThreadId);
7709
7710 if (HasOffloadingArrays) {
7711 assert(TaskTy != TaskWithPrivatesTy &&
7712 "If there are offloading arrays to pass to the target"
7713 "TaskTy cannot be the same as TaskWithPrivatesTy");
7714 (void)TaskTy;
7715 Value *Privates =
7716 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7717 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7718 KernelLaunchArgs.push_back(
7719 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7720 }
7721
7722 if (HasShareds) {
7723 auto *ArgStructAlloca =
7724 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7725 assert(ArgStructAlloca &&
7726 "Unable to find the alloca instruction corresponding to arguments "
7727 "for extracted function");
7728 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7729
7730 AllocaInst *NewArgStructAlloca =
7731 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7732
7733 Value *SharedsSize =
7734 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7735
7736 LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
7737 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7738
7739 Builder.CreateMemCpy(
7740 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7741 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7742 KernelLaunchArgs.push_back(NewArgStructAlloca);
7743 }
7744 Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
7745 Builder.CreateRetVoid();
7746 return ProxyFn;
7747}
7748 static Type *getOffloadingArrayType(Value *V) {
7749
7750 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7751 return GEP->getSourceElementType();
7752 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7753 return Alloca->getAllocatedType();
7754
7755 llvm_unreachable("Unhandled Instruction type");
7756 return nullptr;
7757}
7758// This function returns a struct that has at most two members.
7759// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7760// descriptor. The second member, if needed, is a struct containing arrays
7761// that need to be passed to the offloaded target kernel. For example,
7762// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7763// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7764// respectively, then the types created by this function are
7765//
7766// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7767// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7768// %struct.privates }
7769// %struct.task_with_privates is returned by this function.
7770// If there aren't any offloading arrays to pass to the target kernel,
7771// %struct.kmp_task_ompbuilder_t is returned.
7772static StructType *
7773createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7774 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7775
7776 if (OffloadingArraysToPrivatize.empty())
7777 return OMPIRBuilder.Task;
7778
7779 SmallVector<Type *, 4> StructFieldTypes;
7780 for (Value *V : OffloadingArraysToPrivatize) {
7781 assert(V->getType()->isPointerTy() &&
7782 "Expected pointer to array to privatize. Got a non-pointer value "
7783 "instead");
7784 Type *ArrayTy = getOffloadingArrayType(V);
7785 assert(ArrayTy && "ArrayType cannot be nullptr");
7786 StructFieldTypes.push_back(ArrayTy);
7787 }
7788 StructType *PrivatesStructTy =
7789 StructType::create(StructFieldTypes, "struct.privates");
7790 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7791 "struct.task_with_privates");
7792}
7793 static Error emitTargetOutlinedFunction(
7794 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
7795 TargetRegionEntryInfo &EntryInfo,
7796 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7797 Function *&OutlinedFn, Constant *&OutlinedFnID,
7798 SmallVectorImpl<Value *> &Inputs,
7799 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7800 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7801
7802 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
7803 [&](StringRef EntryFnName) {
7804 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
7805 EntryFnName, Inputs, CBFunc,
7806 ArgAccessorFuncCB);
7807 };
7808
7809 return OMPBuilder.emitTargetRegionFunction(
7810 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
7811 OutlinedFnID);
7812}
7813
7814OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7815 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7816 OpenMPIRBuilder::InsertPointTy AllocaIP,
7817 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
7818 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7819
7820 // The following explains the code-gen scenario for the `target` directive. A
7821 // similar scenario is followed for other device-related directives (e.g.
7822 // `target enter data`), since we only need to emit a task that encapsulates
7823 // the proper runtime call.
7824 //
7825 // When we arrive at this function, the target region itself has been
7826 // outlined into the function OutlinedFn.
7827 // So at this point, for
7828 // --------------------------------------------------------------
7829 // void user_code_that_offloads(...) {
7830 // omp target depend(..) map(from:a) map(to:b) private(i)
7831 // do i = 1, 10
7832 // a(i) = b(i) + n
7833 // }
7834 //
7835 // --------------------------------------------------------------
7836 //
7837 // we have
7838 //
7839 // --------------------------------------------------------------
7840 //
7841 // void user_code_that_offloads(...) {
7842 // %.offload_baseptrs = alloca [2 x ptr], align 8
7843 // %.offload_ptrs = alloca [2 x ptr], align 8
7844 // %.offload_mappers = alloca [2 x ptr], align 8
7845 // ;; target region has been outlined and now we need to
7846 // ;; offload to it via a target task.
7847 // }
7848 // void outlined_device_function(ptr a, ptr b, ptr n) {
7849 // n = *n_ptr;
7850 // do i = 1, 10
7851 // a(i) = b(i) + n
7852 // }
7853 //
7854 // We have to now do the following
7855 // (i) Make an offloading call to outlined_device_function using the OpenMP
7856 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7857 // emitted by emitKernelLaunch
7858 // (ii) Create a task entry point function that calls kernel_launch_function
7859 // and is the entry point for the target task. See
7860 // '@.omp_target_task_proxy_func in the pseudocode below.
7861 // (iii) Create a task with the task entry point created in (ii)
7862 //
7863 // That is we create the following
7864 // struct task_with_privates {
7865 // struct kmp_task_ompbuilder_t task_struct;
7866 // struct privates {
7867 // [2 x ptr] ; baseptrs
7868 // [2 x ptr] ; ptrs
7869 // [2 x i64] ; sizes
7870 // }
7871 // }
7872 // void user_code_that_offloads(...) {
7873 // %.offload_baseptrs = alloca [2 x ptr], align 8
7874 // %.offload_ptrs = alloca [2 x ptr], align 8
7875 // %.offload_sizes = alloca [2 x i64], align 8
7876 //
7877 // %structArg = alloca { ptr, ptr, ptr }, align 8
7878 // %strucArg[0] = a
7879 // %strucArg[1] = b
7880 // %strucArg[2] = &n
7881 //
7882 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7883 // sizeof(kmp_task_ompbuilder_t),
7884 // sizeof(structArg),
7885 // @.omp_target_task_proxy_func,
7886 // ...)
7887 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7888 // sizeof(structArg))
7889 // memcpy(target_task_with_privates->privates->baseptrs,
7890 // offload_baseptrs, sizeof(offload_baseptrs)
7891 // memcpy(target_task_with_privates->privates->ptrs,
7892 // offload_ptrs, sizeof(offload_ptrs)
7893 // memcpy(target_task_with_privates->privates->sizes,
7894 // offload_sizes, sizeof(offload_sizes)
7895 // dependencies_array = ...
7896 // ;; if nowait not present
7897 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7898 // call @__kmpc_omp_task_begin_if0(...)
7899 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7900 // %target_task_with_privates)
7901 // call @__kmpc_omp_task_complete_if0(...)
7902 // }
7903 //
7904 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7905 // ptr %task) {
7906 // %structArg = alloca {ptr, ptr, ptr}
7907 // %task_ptr = getelementptr(%task, 0, 0)
7908 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7909 // mempcy(%structArg, %shared_data, sizeof(%structArg))
7910 //
7911 // %offloading_arrays = getelementptr(%task, 0, 1)
7912 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7913 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7914 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7915 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7916 // %offload_sizes, %structArg)
7917 // }
7918 //
7919 // We need the proxy function because the signature of the task entry point
7920 // expected by kmpc_omp_task is always the same and will be different from
7921 // that of the kernel_launch function.
7922 //
7923 // kernel_launch_function is generated by emitKernelLaunch and has the
7924 // always_inline attribute. For this example, it'll look like so:
7925 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7926 // %offload_sizes, %structArg) alwaysinline {
7927 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7928 // ; load aggregated data from %structArg
7929 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7930 // ; offload_sizes
7931 // call i32 @__tgt_target_kernel(...,
7932 // outlined_device_function,
7933 // ptr %kernel_args)
7934 // }
7935 // void outlined_device_function(ptr a, ptr b, ptr n) {
7936 // n = *n_ptr;
7937 // do i = 1, 10
7938 // a(i) = b(i) + n
7939 // }
7940 //
7941 BasicBlock *TargetTaskBodyBB =
7942 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7943 BasicBlock *TargetTaskAllocaBB =
7944 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7945
7946 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7947 TargetTaskAllocaBB->begin());
7948 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7949
7950 OutlineInfo OI;
7951 OI.EntryBB = TargetTaskAllocaBB;
7952 OI.OuterAllocaBB = AllocaIP.getBlock();
7953
7954 // Add the thread ID argument.
7955 SmallVector<Instruction *, 4> ToBeDeleted;
7956 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7957 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7958
7959 // Generate the task body which will subsequently be outlined.
7960 Builder.restoreIP(TargetTaskBodyIP);
7961 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7962 return Err;
7963
7964 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
7965 // it is given. These blocks are enumerated by
7966 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7967 // to be outside the region. In other words, OI.ExitBlock is expected to be
7968 // the start of the region after the outlining. We used to set OI.ExitBlock
7969 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7970 // except when the task body is a single basic block. In that case,
7971 // OI.ExitBlock is set to the single task body block and will get left out of
7972 // the outlining process. So, simply create a new empty block to which we
7973 // unconditionally branch from where TaskBodyCB left off.
7974 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7975 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7976 /*IsFinished=*/true);
7977
7978 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7979 bool NeedsTargetTask = HasNoWait && DeviceID;
7980 if (NeedsTargetTask) {
7981 for (auto *V :
7982 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7983 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7984 RTArgs.SizesArray}) {
7985 if (V) {
7986 OffloadingArraysToPrivatize.push_back(V);
7987 OI.ExcludeArgsFromAggregate.push_back(V);
7988 }
7989 }
7990 }
7991 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7992 DeviceID, OffloadingArraysToPrivatize](
7993 Function &OutlinedFn) mutable {
7994 assert(OutlinedFn.hasOneUse() &&
7995 "there must be a single user for the outlined function");
7996
7997 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
7998
7999 // The first argument of StaleCI is always the thread id.
8000 // The next few arguments are the pointers to offloading arrays
8001 // if any. (see OffloadingArraysToPrivatize)
8002 // Finally, all other local values that are live-in into the outlined region
8003 // end up in a structure whose pointer is passed as the last argument. This
8004 // piece of data is passed in the "shared" field of the task structure. So,
8005 // we know we have to pass shareds to the task if the number of arguments is
8006 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8007 // thread id. Further, for safety, we assert that the number of arguments of
8008 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
8009 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8010 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8011 assert((!HasShareds ||
8012 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8013 "Wrong number of arguments for StaleCI when shareds are present");
8014 int SharedArgOperandNo =
8015 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8016
8017 StructType *TaskWithPrivatesTy =
8018 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8019 StructType *PrivatesTy = nullptr;
8020
8021 if (!OffloadingArraysToPrivatize.empty())
8022 PrivatesTy =
8023 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8024
8025 Function *ProxyFn = emitTargetTaskProxyFunction(
8026 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8027 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8028
8029 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8030 << "\n");
8031
8032 Builder.SetInsertPoint(StaleCI);
8033
8034 // Gather the arguments for emitting the runtime call.
8035 uint32_t SrcLocStrSize;
8036 Constant *SrcLocStr =
8037 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8038 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8039
8040 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8041 //
8042 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8043 // the DeviceID to the deferred task, and also because
8044 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8045 Function *TaskAllocFn =
8046 !NeedsTargetTask
8047 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8048 : getOrCreateRuntimeFunctionPtr(
8049 OMPRTL___kmpc_omp_target_task_alloc);
8050
8051 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
8052 // call.
8053 Value *ThreadID = getOrCreateThreadID(Ident);
8054
8055 // Argument - `sizeof_kmp_task_t` (TaskSize)
8056 // Tasksize refers to the size in bytes of kmp_task_t data structure
8057 // plus any other data to be passed to the target task, if any, which
8058 // is packed into a struct. kmp_task_t and the struct so created are
8059 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8060 Value *TaskSize = Builder.getInt64(
8061 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8062
8063 // Argument - `sizeof_shareds` (SharedsSize)
8064 // SharedsSize refers to the shareds array size in the kmp_task_t data
8065 // structure.
8066 Value *SharedsSize = Builder.getInt64(0);
8067 if (HasShareds) {
8068 auto *ArgStructAlloca =
8069 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8070 assert(ArgStructAlloca &&
8071 "Unable to find the alloca instruction corresponding to arguments "
8072 "for extracted function");
8073 auto *ArgStructType =
8074 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8075 assert(ArgStructType && "Unable to find struct type corresponding to "
8076 "arguments for extracted function");
8077 SharedsSize =
8078 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8079 }
8080
8081 // Argument - `flags`
8082 // Task is tied iff (Flags & 1) == 1.
8083 // Task is untied iff (Flags & 1) == 0.
8084 // Task is final iff (Flags & 2) == 2.
8085 // Task is not final iff (Flags & 2) == 0.
8086 // A target task is not final and is untied.
8087 Value *Flags = Builder.getInt32(0);
8088
8089 // Emit the @__kmpc_omp_task_alloc runtime call
8090 // The runtime call returns a pointer to an area where the task captured
8091 // variables must be copied before the task is run (TaskData)
8092 CallInst *TaskData = nullptr;
8093
8094 SmallVector<llvm::Value *> TaskAllocArgs = {
8095 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8096 /*flags=*/Flags,
8097 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8098 /*task_func=*/ProxyFn};
8099
8100 if (NeedsTargetTask) {
8101 assert(DeviceID && "Expected non-empty device ID.");
8102 TaskAllocArgs.push_back(DeviceID);
8103 }
8104
8105 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8106
8107 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8108 if (HasShareds) {
8109 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8110 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8111 *this, Builder, TaskData, TaskWithPrivatesTy);
8112 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8113 SharedsSize);
8114 }
8115 if (!OffloadingArraysToPrivatize.empty()) {
8116 Value *Privates =
8117 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8118 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8119 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8120 [[maybe_unused]] Type *ArrayType =
8121 getOffloadingArrayType(PtrToPrivatize);
8122 assert(ArrayType && "ArrayType cannot be nullptr");
8123
8124 Type *ElementType = PrivatesTy->getElementType(i);
8125 assert(ElementType == ArrayType &&
8126 "ElementType should match ArrayType");
8127 (void)ArrayType;
8128
8129 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8130 Builder.CreateMemCpy(
8131 Dst, Alignment, PtrToPrivatize, Alignment,
8132 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8133 }
8134 }
8135
8136 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8137
8138 // ---------------------------------------------------------------
8139 // V5.2 13.8 target construct
8140 // If the nowait clause is present, execution of the target task
8141 // may be deferred. If the nowait clause is not present, the target task is
8142 // an included task.
8143 // ---------------------------------------------------------------
8144 // The above means that the lack of a nowait on the target construct
8145 // translates to '#pragma omp task if(0)'
8146 if (!NeedsTargetTask) {
8147 if (DepArray) {
8148 Function *TaskWaitFn =
8149 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8150 Builder.CreateCall(
8151 TaskWaitFn,
8152 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8153 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8154 /*dep_list=*/DepArray,
8155 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8156 /*noalias_dep_list=*/
8157 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8158 }
8159 // Included task.
8160 Function *TaskBeginFn =
8161 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8162 Function *TaskCompleteFn =
8163 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8164 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8165 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8166 CI->setDebugLoc(StaleCI->getDebugLoc());
8167 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8168 } else if (DepArray) {
8169 // HasNoWait - meaning the task may be deferred. Call
8170 // __kmpc_omp_task_with_deps if there are dependencies,
8171 // else call __kmpc_omp_task
8172 Function *TaskFn =
8173 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8174 Builder.CreateCall(
8175 TaskFn,
8176 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8177 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8178 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8179 } else {
8180 // Emit the @__kmpc_omp_task runtime call to spawn the task
8181 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8182 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8183 }
8184
8185 StaleCI->eraseFromParent();
8186 for (Instruction *I : llvm::reverse(ToBeDeleted))
8187 I->eraseFromParent();
8188 };
8189 addOutlineInfo(std::move(OI));
8190
8191 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8192 << *(Builder.GetInsertBlock()) << "\n");
8193 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8194 << *(Builder.GetInsertBlock()->getParent()->getParent())
8195 << "\n");
8196 return Builder.saveIP();
8197}
8198
8199Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8200 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8201 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8202 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8203 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8204 if (Error Err =
8205 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8206 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8207 return Err;
8208 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8209 return Error::success();
8210}
8211
8212static void emitTargetCall(
8213 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8214 OpenMPIRBuilder::InsertPointTy AllocaIP,
8215 OpenMPIRBuilder::TargetDataInfo &Info,
8216 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8217 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8218 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8219 SmallVectorImpl<Value *> &Args,
8220 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8221 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8222 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8223 bool HasNoWait) {
8224 // Generate a function call to the host fallback implementation of the target
8225 // region. This is called by the host when no offload entry was generated for
8226 // the target region and when the offloading call fails at runtime.
8227 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8228 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8229 Builder.restoreIP(IP);
8230 Builder.CreateCall(OutlinedFn, Args);
8231 return Builder.saveIP();
8232 };
8233
8234 bool HasDependencies = Dependencies.size() > 0;
8235 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8236
8237 OpenMPIRBuilder::TargetKernelArgs KArgs;
8238
8239 auto TaskBodyCB =
8240 [&](Value *DeviceID, Value *RTLoc,
8241 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8242 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8243 // produce any.
8244 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8245 // emitKernelLaunch makes the necessary runtime call to offload the
8246 // kernel. We then outline all that code into a separate function
8247 // ('kernel_launch_function' in the pseudo code above). This function is
8248 // then called by the target task proxy function (see
8249 // '@.omp_target_task_proxy_func' in the pseudo code above)
8250 // "@.omp_target_task_proxy_func' is generated by
8251 // emitTargetTaskProxyFunction.
8252 if (OutlinedFnID && DeviceID)
8253 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8254 EmitTargetCallFallbackCB, KArgs,
8255 DeviceID, RTLoc, TargetTaskAllocaIP);
8256
8257 // We only need to do the outlining if `DeviceID` is set to avoid calling
8258 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8259 // generating the `else` branch of an `if` clause.
8260 //
8261 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8262 // In this case, we execute the host implementation directly.
8263 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8264 }());
8265
8266 OMPBuilder.Builder.restoreIP(AfterIP);
8267 return Error::success();
8268 };
8269
8270 auto &&EmitTargetCallElse =
8271 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8272 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8273 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8274 // produce any.
8275 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8276 if (RequiresOuterTargetTask) {
8277 // Arguments that are intended to be directly forwarded to an
8278 // emitKernelLaunch call are passed as nullptr, since
8279 // OutlinedFnID=nullptr results in that call not being done.
8280 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8281 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8282 /*RTLoc=*/nullptr, AllocaIP,
8283 Dependencies, EmptyRTArgs, HasNoWait);
8284 }
8285 return EmitTargetCallFallbackCB(Builder.saveIP());
8286 }());
8287
8288 Builder.restoreIP(AfterIP);
8289 return Error::success();
8290 };
8291
8292 auto &&EmitTargetCallThen =
8293 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8294 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8295 Info.HasNoWait = HasNoWait;
8296 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8297 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8298 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8299 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8300 /*IsNonContiguous=*/true,
8301 /*ForEndCall=*/false))
8302 return Err;
8303
8304 SmallVector<Value *, 3> NumTeamsC;
8305 for (auto [DefaultVal, RuntimeVal] :
8306 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8307 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8308 : Builder.getInt32(DefaultVal));
8309
8310 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8311 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8312 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8313 if (Clause)
8314 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8315 /*isSigned=*/false);
8316 return Clause;
8317 };
8318 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8319 if (Clause)
8320 Result =
8321 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8322 Result, Clause)
8323 : Clause;
8324 };
8325
8326 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8327 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8328 SmallVector<Value *, 3> NumThreadsC;
8329 Value *MaxThreadsClause =
8330 RuntimeAttrs.TeamsThreadLimit.size() == 1
8331 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8332 : nullptr;
8333
8334 for (auto [TeamsVal, TargetVal] : zip_equal(
8335 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8336 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8337 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8338
8339 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8340 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8341
8342 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8343 }
8344
8345 unsigned NumTargetItems = Info.NumberOfPtrs;
8346 // TODO: Use correct device ID
8347 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8348 uint32_t SrcLocStrSize;
8349 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8350 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8351 llvm::omp::IdentFlag(0), 0);
8352
8353 Value *TripCount = RuntimeAttrs.LoopTripCount
8354 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8355 Builder.getInt64Ty(),
8356 /*isSigned=*/false)
8357 : Builder.getInt64(0);
8358
8359 // TODO: Use correct DynCGGroupMem
8360 Value *DynCGGroupMem = Builder.getInt32(0);
8361
8362 KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
8363 NumTeamsC, NumThreadsC,
8364 DynCGGroupMem, HasNoWait);
8365
8366 // Assume no error was returned because TaskBodyCB and
8367 // EmitTargetCallFallbackCB don't produce any.
8368 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8369 // The presence of certain clauses on the target directive requires the
8370 // explicit generation of the target task.
8371 if (RequiresOuterTargetTask)
8372 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8373 Dependencies, KArgs.RTArgs,
8374 Info.HasNoWait);
8375
8376 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8377 EmitTargetCallFallbackCB, KArgs,
8378 DeviceID, RTLoc, AllocaIP);
8379 }());
8380
8381 Builder.restoreIP(AfterIP);
8382 return Error::success();
8383 };
8384
8385 // If we don't have an ID for the target region, it means an offload entry
8386 // wasn't created. In this case we just run the host fallback directly and
8387 // ignore any potential 'if' clauses.
8388 if (!OutlinedFnID) {
8389 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8390 return;
8391 }
8392
8393 // If there's no 'if' clause, only generate the kernel launch code path.
8394 if (!IfCond) {
8395 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8396 return;
8397 }
8398
8399 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8400 EmitTargetCallElse, AllocaIP));
8401}
8402
8403OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8404 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8405 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8406 TargetRegionEntryInfo &EntryInfo,
8407 const TargetKernelDefaultAttrs &DefaultAttrs,
8408 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8409 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8410 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8411 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8412 CustomMapperCallbackTy CustomMapperCB,
8413 const SmallVector<DependData> &Dependencies, bool HasNowait) {
8414
8415 if (!updateToLocation(Loc))
8416 return InsertPointTy();
8417
8418 Builder.restoreIP(CodeGenIP);
8419
8420 Function *OutlinedFn;
8421 Constant *OutlinedFnID = nullptr;
8422 // The target region is outlined into its own function. The LLVM IR for
8423 // the target region itself is generated using the callbacks CBFunc
8424 // and ArgAccessorFuncCB.
8425 if (Error Err = emitTargetOutlinedFunction(
8426 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8427 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8428 return Err;
8429
8430 // If we are not on the target device, then we need to generate code
8431 // to make a remote call (offload) to the previously outlined function
8432 // that represents the target region. Do that now.
8433 if (!Config.isTargetDevice())
8434 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8435 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8436 CustomMapperCB, Dependencies, HasNowait);
8437 return Builder.saveIP();
8438}
8439
8440std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8441 StringRef FirstSeparator,
8442 StringRef Separator) {
8443 SmallString<128> Buffer;
8444 llvm::raw_svector_ostream OS(Buffer);
8445 StringRef Sep = FirstSeparator;
8446 for (StringRef Part : Parts) {
8447 OS << Sep << Part;
8448 Sep = Separator;
8449 }
8450 return OS.str().str();
8451}
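
// For example, getNameWithSeparators({"omp", "reduction", "var"}, ".", "$")
// yields ".omp$reduction$var": FirstSeparator prefixes the first part and
// Separator joins the remaining ones.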
8452
8453std::string
8454OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8455 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8456 Config.separator());
8457}
8458
8459 GlobalVariable *
8460 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
8461 unsigned AddressSpace) {
8462 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8463 if (Elem.second) {
8464 assert(Elem.second->getValueType() == Ty &&
8465 "OMP internal variable has different type than requested");
8466 } else {
8467 // TODO: investigate the appropriate linkage type used for the global
8468 // variable for possibly changing that to internal or private, or maybe
8469 // create different versions of the function for different OMP internal
8470 // variables.
8471 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8472 ? GlobalValue::InternalLinkage
8473 : GlobalValue::CommonLinkage;
8474 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8475 Constant::getNullValue(Ty), Elem.first(),
8476 /*InsertBefore=*/nullptr,
8477 GlobalValue::NotThreadLocal, AddressSpace);
8478 const DataLayout &DL = M.getDataLayout();
8479 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8480 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
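// Editorial note: the alignment below is the maximum of the type's ABI
// alignment and the pointer ABI alignment, presumably so the variable can be
// handed to runtime routines by address regardless of its element type.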
8481 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8482 Elem.second = GV;
8483 }
8484
8485 return Elem.second;
8486}
8487
8488Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8489 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8490 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8491 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8492}
8493
8494Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8495 LLVMContext &Ctx = Builder.getContext();
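// Editorial note: the size is computed with the classic "GEP element 1 of a
// null pointer, then ptrtoint" idiom; the resulting integer is the allocation
// size in bytes of the GEP'd type, and no memory is ever dereferenced.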
8496 Value *Null =
8497 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8498 Value *SizeGep =
8499 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8500 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8501 return SizePtrToInt;
8502}
8503
8504 GlobalVariable *
8505OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8506 std::string VarName) {
8507 llvm::Constant *MaptypesArrayInit =
8508 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8509 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8510 M, MaptypesArrayInit->getType(),
8511 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8512 VarName);
8513 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8514 return MaptypesArrayGlobal;
8515}
8516
8517void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8518 InsertPointTy AllocaIP,
8519 unsigned NumOperands,
8520 struct MapperAllocas &MapperAllocas) {
8521 if (!updateToLocation(Loc))
8522 return;
8523
8524 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8525 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8526 Builder.restoreIP(AllocaIP);
8527 AllocaInst *ArgsBase = Builder.CreateAlloca(
8528 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8529 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8530 ".offload_ptrs");
8531 AllocaInst *ArgSizes = Builder.CreateAlloca(
8532 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8533 updateToLocation(Loc);
8534 MapperAllocas.ArgsBase = ArgsBase;
8535 MapperAllocas.Args = Args;
8536 MapperAllocas.ArgSizes = ArgSizes;
8537}
8538
8539void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8540 Function *MapperFunc, Value *SrcLocInfo,
8541 Value *MaptypesArg, Value *MapnamesArg,
8542 struct MapperAllocas &MapperAllocas,
8543 int64_t DeviceID, unsigned NumOperands) {
8544 if (!updateToLocation(Loc))
8545 return;
8546
8547 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8548 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8549 Value *ArgsBaseGEP =
8550 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8551 {Builder.getInt32(0), Builder.getInt32(0)});
8552 Value *ArgsGEP =
8553 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8554 {Builder.getInt32(0), Builder.getInt32(0)});
8555 Value *ArgSizesGEP =
8556 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8557 {Builder.getInt32(0), Builder.getInt32(0)});
8558 Value *NullPtr =
8559 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
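// Editorial note: NullPtr is passed as the trailing arg_mappers parameter of
// the *_mapper runtime entry points; this helper never supplies user-defined
// mappers.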
8560 Builder.CreateCall(MapperFunc,
8561 {SrcLocInfo, Builder.getInt64(DeviceID),
8562 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8563 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8564}
8565
8566void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8567 TargetDataRTArgs &RTArgs,
8568 TargetDataInfo &Info,
8569 bool ForEndCall) {
8570 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8571 "expected region end call to runtime only when end call is separate");
8572 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8573 auto VoidPtrTy = UnqualPtrTy;
8574 auto VoidPtrPtrTy = UnqualPtrTy;
8575 auto Int64Ty = Type::getInt64Ty(M.getContext());
8576 auto Int64PtrTy = UnqualPtrTy;
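// Editorial note: with opaque pointers, the three aliases above all denote
// the same "ptr" type; the distinct names merely document the intended
// pointee types of each runtime argument.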
8577
8578 if (!Info.NumberOfPtrs) {
8579 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8580 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8581 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8582 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8583 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8584 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8585 return;
8586 }
8587
8588 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8589 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8590 Info.RTArgs.BasePointersArray,
8591 /*Idx0=*/0, /*Idx1=*/0);
8592 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8593 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8594 /*Idx0=*/0,
8595 /*Idx1=*/0);
8596 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8597 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8598 /*Idx0=*/0, /*Idx1=*/0);
8599 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8600 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8601 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8602 : Info.RTArgs.MapTypesArray,
8603 /*Idx0=*/0,
8604 /*Idx1=*/0);
8605
8606 // Only emit the mapper information arrays if debug information is
8607 // requested.
8608 if (!Info.EmitDebug)
8609 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8610 else
8611 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8612 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8613 /*Idx0=*/0,
8614 /*Idx1=*/0);
8615 // If there is no user-defined mapper, set the mapper array to nullptr to
8616 // avoid an unnecessary data privatization
8617 if (!Info.HasMapper)
8618 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8619 else
8620 RTArgs.MappersArray =
8621 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8622}
8623
8624void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8625 InsertPointTy CodeGenIP,
8626 MapInfosTy &CombinedInfo,
8627 TargetDataInfo &Info) {
8628 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8629 CombinedInfo.NonContigInfo;
8630
8631 // Build an array of struct descriptor_dim and then assign it to
8632 // offload_args.
8633 //
8634 // struct descriptor_dim {
8635 // uint64_t offset;
8636 // uint64_t count;
8637 // uint64_t stride
8638 // };
8639 Type *Int64Ty = Builder.getInt64Ty();
8640 StructType *DimTy = StructType::create(
8641 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8642 "struct.descriptor_dim");
8643
8644 enum { OffsetFD = 0, CountFD, StrideFD };
8645 // We need two index variables here since the size of "Dims" is the same as
8646 // the size of Components; however, the sizes of offset, count, and stride
8647 // equal the number of base declarations that are non-contiguous.
8648 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8649 // Skip emitting IR if the dimension size is 1, since such a dimension
8650 // cannot be non-contiguous.
8651 if (NonContigInfo.Dims[I] == 1)
8652 continue;
8653 Builder.restoreIP(AllocaIP);
8654 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
8655 AllocaInst *DimsAddr =
8656 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
8657 Builder.restoreIP(CodeGenIP);
8658 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
8659 unsigned RevIdx = EE - II - 1;
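// Editorial note: RevIdx walks the Offsets/Counts/Strides arrays back to
// front, so the descriptor_dim entries are emitted in the reverse of the
// order in which the dimensions were collected.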
8660 Value *DimsLVal = Builder.CreateInBoundsGEP(
8661 DimsAddr->getAllocatedType(), DimsAddr,
8662 {Builder.getInt64(0), Builder.getInt64(II)});
8663 // Offset
8664 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
8665 Builder.CreateAlignedStore(
8666 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
8667 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
8668 // Count
8669 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
8670 Builder.CreateAlignedStore(
8671 NonContigInfo.Counts[L][RevIdx], CountLVal,
8672 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8673 // Stride
8674 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
8675 Builder.CreateAlignedStore(
8676 NonContigInfo.Strides[L][RevIdx], StrideLVal,
8677 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
8678 }
8679 // args[I] = &dims
8680 Builder.restoreIP(CodeGenIP);
8681 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
8682 DimsAddr, Builder.getPtrTy());
8683 Value *P = Builder.CreateConstInBoundsGEP2_32(
8684 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
8685 Info.RTArgs.PointersArray, 0, I);
8686 Builder.CreateAlignedStore(
8687 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
8688 ++L;
8689 }
8690}
8691
8692void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
8693 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
8694 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
8695 BasicBlock *ExitBB, bool IsInit) {
8696 StringRef Prefix = IsInit ? ".init" : ".del";
8697
8698 // Evaluate if this is an array section.
8699 BasicBlock *BodyBB = BasicBlock::Create(
8700 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
8701 Value *IsArray =
8702 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
8703 Value *DeleteBit = Builder.CreateAnd(
8704 MapType,
8705 Builder.getInt64(
8706 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8707 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
8708 Value *DeleteCond;
8709 Value *Cond;
8710 if (IsInit) {
8711 // base != begin?
8712 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
8713 // IsPtrAndObj?
8714 Value *PtrAndObjBit = Builder.CreateAnd(
8715 MapType,
8716 Builder.getInt64(
8717 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8718 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
8719 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
8720 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
8721 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
8722 DeleteCond = Builder.CreateIsNull(
8723 DeleteBit,
8724 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8725 } else {
8726 Cond = IsArray;
8727 DeleteCond = Builder.CreateIsNotNull(
8728 DeleteBit,
8729 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
8730 }
8731 Cond = Builder.CreateAnd(Cond, DeleteCond);
8732 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
8733
8734 emitBlock(BodyBB, MapperFn);
8735 // Get the array size by multiplying element size and element number (i.e., \p
8736 // Size).
8737 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
8738 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
8739 // memory allocation/deletion purpose only.
8740 Value *MapTypeArg = Builder.CreateAnd(
8741 MapType,
8742 Builder.getInt64(
8743 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8744 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8745 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8746 MapTypeArg = Builder.CreateOr(
8747 MapTypeArg,
8748 Builder.getInt64(
8749 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8750 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
8751
8752 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8753 // data structure.
8754 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
8755 ArraySize, MapTypeArg, MapName};
8756 Builder.CreateCall(
8757 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8758 OffloadingArgs);
8759}
8760
8761Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8762 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8763 llvm::Value *BeginArg)>
8764 GenMapInfoCB,
8765 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
8766 SmallVector<Type *> Params;
8767 Params.emplace_back(Builder.getPtrTy());
8768 Params.emplace_back(Builder.getPtrTy());
8769 Params.emplace_back(Builder.getPtrTy());
8770 Params.emplace_back(Builder.getInt64Ty());
8771 Params.emplace_back(Builder.getInt64Ty());
8772 Params.emplace_back(Builder.getPtrTy());
8773
8774 auto *FnTy =
8775 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8776
8777 SmallString<64> TyStr;
8778 raw_svector_ostream Out(TyStr);
8779 Function *MapperFn =
8780 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
8781 MapperFn->addFnAttr(Attribute::NoInline);
8782 MapperFn->addFnAttr(Attribute::NoUnwind);
8783 MapperFn->addParamAttr(0, Attribute::NoUndef);
8784 MapperFn->addParamAttr(1, Attribute::NoUndef);
8785 MapperFn->addParamAttr(2, Attribute::NoUndef);
8786 MapperFn->addParamAttr(3, Attribute::NoUndef);
8787 MapperFn->addParamAttr(4, Attribute::NoUndef);
8788 MapperFn->addParamAttr(5, Attribute::NoUndef);
8789
8790 // Start the mapper function code generation.
8791 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8792 auto SavedIP = Builder.saveIP();
8793 Builder.SetInsertPoint(EntryBB);
8794
8795 Value *MapperHandle = MapperFn->getArg(0);
8796 Value *BaseIn = MapperFn->getArg(1);
8797 Value *BeginIn = MapperFn->getArg(2);
8798 Value *Size = MapperFn->getArg(3);
8799 Value *MapType = MapperFn->getArg(4);
8800 Value *MapName = MapperFn->getArg(5);
8801
8802 // Compute the starting and end addresses of array elements.
8803 // Prepare common arguments for array initialization and deletion.
8804 // Convert the size in bytes into the number of array elements.
8805 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8806 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8807 Value *PtrBegin = BeginIn;
8808 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
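// Editorial note: PtrEnd points one past the last element; the loop emitted
// below iterates over the half-open range [PtrBegin, PtrEnd).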
8809
8810 // Emit array initialization if this is an array section and \p MapType
8811 // indicates that memory allocation is required.
8812 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8813 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8814 MapType, MapName, ElementSize, HeadBB,
8815 /*IsInit=*/true);
8816
8817 // Emit a for loop that iterates over the \p Size elements and maps each of them.
8818
8819 // Emit the loop header block.
8820 emitBlock(HeadBB, MapperFn);
8821 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8822 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8823 // Evaluate whether the initial condition is satisfied.
8824 Value *IsEmpty =
8825 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8826 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8827
8828 // Emit the loop body block.
8829 emitBlock(BodyBB, MapperFn);
8830 BasicBlock *LastBB = BodyBB;
8831 PHINode *PtrPHI =
8832 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8833 PtrPHI->addIncoming(PtrBegin, HeadBB);
8834
8835 // Get map clause information. Fill up the arrays with all mapped variables.
8836 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8837 if (!Info)
8838 return Info.takeError();
8839
8840 // Call the runtime API __tgt_mapper_num_components to get the number of
8841 // pre-existing components.
8842 Value *OffloadingArgs[] = {MapperHandle};
8843 Value *PreviousSize = Builder.CreateCall(
8844 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8845 OffloadingArgs);
8846 Value *ShiftedPreviousSize =
8847 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
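// Editorial note: the shift places the pre-existing component count into the
// MEMBER_OF bit-field of the map type, so components pushed in the loop below
// are recorded relative to the components already registered.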
8848
8849 // Fill up the runtime mapper handle for all components.
8850 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8851 Value *CurBaseArg = Info->BasePointers[I];
8852 Value *CurBeginArg = Info->Pointers[I];
8853 Value *CurSizeArg = Info->Sizes[I];
8854 Value *CurNameArg = Info->Names.size()
8855 ? Info->Names[I]
8856 : Constant::getNullValue(Builder.getPtrTy());
8857
8858 // Extract the MEMBER_OF field from the map type.
8859 Value *OriMapType = Builder.getInt64(
8860 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8861 Info->Types[I]));
8862 Value *MemberMapType =
8863 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8864
8865 // Combine the map type inherited from user-defined mapper with that
8866 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8867 // bits of the \a MapType, which is the input argument of the mapper
8868 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8869 // bits of MemberMapType.
8870 // [OpenMP 5.0], 1.2.6. map-type decay.
8871 // | alloc | to | from | tofrom | release | delete
8872 // ----------------------------------------------------------
8873 // alloc | alloc | alloc | alloc | alloc | release | delete
8874 // to | alloc | to | alloc | to | release | delete
8875 // from | alloc | alloc | from | from | release | delete
8876 // tofrom | alloc | to | from | tofrom | release | delete
8877 Value *LeftToFrom = Builder.CreateAnd(
8878 MapType,
8879 Builder.getInt64(
8880 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8881 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8882 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8883 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8884 BasicBlock *AllocElseBB =
8885 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8886 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8887 BasicBlock *ToElseBB =
8888 BasicBlock::Create(M.getContext(), "omp.type.to.else");
8889 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8890 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8891 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8892 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8893 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8894 emitBlock(AllocBB, MapperFn);
8895 Value *AllocMapType = Builder.CreateAnd(
8896 MemberMapType,
8897 Builder.getInt64(
8898 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8899 OpenMPOffloadMappingFlags::OMP_MAP_TO |
8900 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8901 Builder.CreateBr(EndBB);
8902 emitBlock(AllocElseBB, MapperFn);
8903 Value *IsTo = Builder.CreateICmpEQ(
8904 LeftToFrom,
8905 Builder.getInt64(
8906 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8907 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8908 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8909 // In case of to, clear OMP_MAP_FROM.
8910 emitBlock(ToBB, MapperFn);
8911 Value *ToMapType = Builder.CreateAnd(
8912 MemberMapType,
8913 Builder.getInt64(
8914 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8915 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8916 Builder.CreateBr(EndBB);
8917 emitBlock(ToElseBB, MapperFn);
8918 Value *IsFrom = Builder.CreateICmpEQ(
8919 LeftToFrom,
8920 Builder.getInt64(
8921 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8922 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8923 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8924 // In case of from, clear OMP_MAP_TO.
8925 emitBlock(FromBB, MapperFn);
8926 Value *FromMapType = Builder.CreateAnd(
8927 MemberMapType,
8928 Builder.getInt64(
8929 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8930 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8931 // In case of tofrom, do nothing.
8932 emitBlock(EndBB, MapperFn);
8933 LastBB = EndBB;
8934 PHINode *CurMapType =
8935 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8936 CurMapType->addIncoming(AllocMapType, AllocBB);
8937 CurMapType->addIncoming(ToMapType, ToBB);
8938 CurMapType->addIncoming(FromMapType, FromBB);
8939 CurMapType->addIncoming(MemberMapType, ToElseBB);
8940
8941 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8942 CurSizeArg, CurMapType, CurNameArg};
8943
8944 auto ChildMapperFn = CustomMapperCB(I);
8945 if (!ChildMapperFn)
8946 return ChildMapperFn.takeError();
8947 if (*ChildMapperFn) {
8948 // Call the corresponding mapper function.
8949 Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8950 } else {
8951 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8952 // data structure.
8953 Builder.CreateCall(
8954 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8955 OffloadingArgs);
8956 }
8957 }
8958
8959 // Update the pointer to point to the next element that needs to be mapped,
8960 // and check whether we have mapped all elements.
8961 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8962 "omp.arraymap.next");
8963 PtrPHI->addIncoming(PtrNext, LastBB);
8964 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8965 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8966 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8967
8968 emitBlock(ExitBB, MapperFn);
8969 // Emit array deletion if this is an array section and \p MapType indicates
8970 // that deletion is required.
8971 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8972 MapType, MapName, ElementSize, DoneBB,
8973 /*IsInit=*/false);
8974
8975 // Emit the function exit block.
8976 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8977
8978 Builder.CreateRetVoid();
8979 Builder.restoreIP(SavedIP);
8980 return MapperFn;
8981}
8982
8983Error OpenMPIRBuilder::emitOffloadingArrays(
8984 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8985 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8986 bool IsNonContiguous,
8987 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8988
8989 // Reset the array information.
8990 Info.clearArrayInfo();
8991 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8992
8993 if (Info.NumberOfPtrs == 0)
8994 return Error::success();
8995
8996 Builder.restoreIP(AllocaIP);
8997 // Detect whether any captured size requires runtime evaluation, so that a
8998 // constant array can be used for the map sizes whenever possible.
8999 ArrayType *PointerArrayType =
9000 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9001
9002 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9003 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9004
9005 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9006 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9007 AllocaInst *MappersArray = Builder.CreateAlloca(
9008 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9009 Info.RTArgs.MappersArray = MappersArray;
9010
9011 // If we don't have any VLA types or other types that require runtime
9012 // evaluation, we can use a constant array for the map sizes, otherwise we
9013 // need to fill up the arrays as we do for the pointers.
9014 Type *Int64Ty = Builder.getInt64Ty();
9015 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9016 ConstantInt::get(Int64Ty, 0));
9017 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9018 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9019 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9020 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9021 if (IsNonContiguous &&
9022 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9023 CombinedInfo.Types[I] &
9024 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9025 ConstSizes[I] =
9026 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9027 else
9028 ConstSizes[I] = CI;
9029 continue;
9030 }
9031 }
9032 RuntimeSizes.set(I);
9033 }
9034
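// Editorial note: three cases follow. If every size needs runtime evaluation,
// a plain alloca is filled later; if every size is constant, the global
// constant array is used directly; in the mixed case the constant array is
// memcpy'd into an alloca and the runtime-evaluated slots are overwritten in
// the per-pointer loop further below.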
9035 if (RuntimeSizes.all()) {
9036 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9037 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9038 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9039 restoreIPandDebugLoc(Builder, CodeGenIP);
9040 } else {
9041 auto *SizesArrayInit = ConstantArray::get(
9042 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9043 std::string Name = createPlatformSpecificName({"offload_sizes"});
9044 auto *SizesArrayGbl =
9045 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9046 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9047 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9048
9049 if (!RuntimeSizes.any()) {
9050 Info.RTArgs.SizesArray = SizesArrayGbl;
9051 } else {
9052 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9053 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9054 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9055 AllocaInst *Buffer = Builder.CreateAlloca(
9056 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9057 Buffer->setAlignment(OffloadSizeAlign);
9058 restoreIPandDebugLoc(Builder, CodeGenIP);
9059 Builder.CreateMemCpy(
9060 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9061 SizesArrayGbl, OffloadSizeAlign,
9062 Builder.getIntN(
9063 IndexSize,
9064 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9065
9066 Info.RTArgs.SizesArray = Buffer;
9067 }
9068 restoreIPandDebugLoc(Builder, CodeGenIP);
9069 }
9070
9071 // The map types are always constant so we don't need to generate code to
9072 // fill arrays. Instead, we create an array constant.
9073 SmallVector<uint64_t, 4> Mapping;
9074 for (auto mapFlag : CombinedInfo.Types)
9075 Mapping.push_back(
9076 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9077 mapFlag));
9078 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9079 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9080 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9081
9082 // The information types are only built if provided.
9083 if (!CombinedInfo.Names.empty()) {
9084 auto *MapNamesArrayGbl = createOffloadMapnames(
9085 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9086 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9087 Info.EmitDebug = true;
9088 } else {
9089 Info.RTArgs.MapNamesArray =
9090 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9091 Info.EmitDebug = false;
9092 }
9093
9094 // If there's a present map type modifier, it must not be applied to the end
9095 // of a region, so generate a separate map type array in that case.
9096 if (Info.separateBeginEndCalls()) {
9097 bool EndMapTypesDiffer = false;
9098 for (uint64_t &Type : Mapping) {
9099 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9100 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9101 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9102 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9103 EndMapTypesDiffer = true;
9104 }
9105 }
9106 if (EndMapTypesDiffer) {
9107 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9108 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9109 }
9110 }
9111
9112 PointerType *PtrTy = Builder.getPtrTy();
9113 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9114 Value *BPVal = CombinedInfo.BasePointers[I];
9115 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9116 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9117 0, I);
9118 Builder.CreateAlignedStore(BPVal, BP,
9119 M.getDataLayout().getPrefTypeAlign(PtrTy));
9120
9121 if (Info.requiresDevicePointerInfo()) {
9122 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9123 CodeGenIP = Builder.saveIP();
9124 Builder.restoreIP(AllocaIP);
9125 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9126 Builder.restoreIP(CodeGenIP);
9127 if (DeviceAddrCB)
9128 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9129 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9130 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9131 if (DeviceAddrCB)
9132 DeviceAddrCB(I, BP);
9133 }
9134 }
9135
9136 Value *PVal = CombinedInfo.Pointers[I];
9137 Value *P = Builder.CreateConstInBoundsGEP2_32(
9138 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9139 I);
9140 // TODO: Check alignment correct.
9141 Builder.CreateAlignedStore(PVal, P,
9142 M.getDataLayout().getPrefTypeAlign(PtrTy));
9143
9144 if (RuntimeSizes.test(I)) {
9145 Value *S = Builder.CreateConstInBoundsGEP2_32(
9146 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9147 /*Idx0=*/0,
9148 /*Idx1=*/I);
9149 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9150 Int64Ty,
9151 /*isSigned=*/true),
9152 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9153 }
9154 // Fill up the mapper array.
9155 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9156 Value *MFunc = ConstantPointerNull::get(PtrTy);
9157
9158 auto CustomMFunc = CustomMapperCB(I);
9159 if (!CustomMFunc)
9160 return CustomMFunc.takeError();
9161 if (*CustomMFunc)
9162 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9163
9164 Value *MAddr = Builder.CreateInBoundsGEP(
9165 MappersArray->getAllocatedType(), MappersArray,
9166 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9167 Builder.CreateAlignedStore(
9168 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9169 }
9170
9171 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9172 Info.NumberOfPtrs == 0)
9173 return Error::success();
9174 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9175 return Error::success();
9176}
9177
9178void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9179 BasicBlock *CurBB = Builder.GetInsertBlock();
9180
9181 if (!CurBB || CurBB->getTerminator()) {
9182 // If there is no insert point or the previous block is already
9183 // terminated, don't touch it.
9184 } else {
9185 // Otherwise, create a fall-through branch.
9186 Builder.CreateBr(Target);
9187 }
9188
9189 Builder.ClearInsertionPoint();
9190}
9191
9192void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9193 bool IsFinished) {
9194 BasicBlock *CurBB = Builder.GetInsertBlock();
9195
9196 // Fall out of the current block (if necessary).
9197 emitBranch(BB);
9198
9199 if (IsFinished && BB->use_empty()) {
9200 BB->eraseFromParent();
9201 return;
9202 }
9203
9204 // Place the block after the current block, if possible, or else at
9205 // the end of the function.
9206 if (CurBB && CurBB->getParent())
9207 CurFn->insert(std::next(CurBB->getIterator()), BB);
9208 else
9209 CurFn->insert(CurFn->end(), BB);
9210 Builder.SetInsertPoint(BB);
9211}
9212
9213Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9214 BodyGenCallbackTy ElseGen,
9215 InsertPointTy AllocaIP) {
9216 // If the condition constant folds and can be elided, try to avoid emitting
9217 // the condition and the dead arm of the if/else.
9218 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9219 auto CondConstant = CI->getSExtValue();
9220 if (CondConstant)
9221 return ThenGen(AllocaIP, Builder.saveIP());
9222
9223 return ElseGen(AllocaIP, Builder.saveIP());
9224 }
9225
9226 Function *CurFn = Builder.GetInsertBlock()->getParent();
9227
9228 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9229 // emit the conditional branch.
9230 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9231 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9232 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9233 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9234 // Emit the 'then' code.
9235 emitBlock(ThenBlock, CurFn);
9236 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9237 return Err;
9238 emitBranch(ContBlock);
9239 // Emit the 'else' code if present.
9240 // There is no need to emit line number for unconditional branch.
9241 emitBlock(ElseBlock, CurFn);
9242 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9243 return Err;
9244 // There is no need to emit line number for unconditional branch.
9245 emitBranch(ContBlock);
9246 // Emit the continuation block for code after the if.
9247 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9248 return Error::success();
9249}
9250
9251bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9252 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9253 assert(!(AO == AtomicOrdering::NotAtomic ||
9254 AO == llvm::AtomicOrdering::Unordered) &&
9255 "Unexpected Atomic Ordering.");
9256
9257 bool Flush = false;
9258 llvm::AtomicOrdering FlushAO = llvm::AtomicOrdering::Monotonic;
9259
9260 switch (AK) {
9261 case Read:
9262 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9263 AO == AtomicOrdering::SequentiallyConsistent) {
9264 FlushAO = AtomicOrdering::Acquire;
9265 Flush = true;
9266 }
9267 break;
9268 case Write:
9269 case Compare:
9270 case Update:
9271 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9272 AO == AtomicOrdering::SequentiallyConsistent) {
9273 FlushAO = AtomicOrdering::Release;
9274 Flush = true;
9275 }
9276 break;
9277 case Capture:
9278 switch (AO) {
9279 case AtomicOrdering::Acquire:
9280 FlushAO = AtomicOrdering::Acquire;
9281 Flush = true;
9282 break;
9283 case AtomicOrdering::Release:
9284 FlushAO = AtomicOrdering::Release;
9285 Flush = true;
9286 break;
9287 case AtomicOrdering::AcquireRelease:
9288 case AtomicOrdering::SequentiallyConsistent:
9289 FlushAO = AtomicOrdering::AcquireRelease;
9290 Flush = true;
9291 break;
9292 default:
9293 // do nothing - leave silently.
9294 break;
9295 }
9296 }
9297
9298 if (Flush) {
9299 // Currently the Flush RT call still doesn't take a memory_ordering argument,
9300 // so until it does, this resolves which atomic ordering the flush should use
9301 // but still issues the plain flush call.
9302 // TODO: pass `FlushAO` after memory ordering support is added
9303 (void)FlushAO;
9304 emitFlush(Loc);
9305 }
9306
9307 // for AO == AtomicOrdering::Monotonic and all other case combinations
9308 // do nothing
9309 return Flush;
9310}
9311
9312OpenMPIRBuilder::InsertPointTy
9313OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9314 AtomicOpValue &X, AtomicOpValue &V,
9315 AtomicOrdering AO, InsertPointTy AllocaIP) {
9316 if (!updateToLocation(Loc))
9317 return Loc.IP;
9318
9319 assert(X.Var->getType()->isPointerTy() &&
9320 "OMP Atomic expects a pointer to target memory");
9321 Type *XElemTy = X.ElemTy;
9322 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9323 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9324 "OMP atomic read expected a scalar type");
9325
9326 Value *XRead = nullptr;
9327
9328 if (XElemTy->isIntegerTy()) {
9329 LoadInst *XLD =
9330 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9331 XLD->setAtomic(AO);
9332 XRead = cast<Value>(XLD);
9333 } else if (XElemTy->isStructTy()) {
9334 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9335 // target does not support `atomicrmw` of the size of the struct
9336 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9337 OldVal->setAtomic(AO);
9338 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9339 unsigned LoadSize =
9340 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9341 OpenMPIRBuilder::AtomicInfo atomicInfo(
9342 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9343 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9344 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9345 XRead = AtomicLoadRes.first;
9346 OldVal->eraseFromParent();
9347 } else {
9348 // We need to perform atomic op as integer
9349 IntegerType *IntCastTy =
9350 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9351 LoadInst *XLoad =
9352 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9353 XLoad->setAtomic(AO);
9354 if (XElemTy->isFloatingPointTy()) {
9355 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9356 } else {
9357 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9358 }
9359 }
9360 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9361 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9362 return Builder.saveIP();
9363}
9364
9365OpenMPIRBuilder::InsertPointTy
9366OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9367 AtomicOpValue &X, Value *Expr,
9368 AtomicOrdering AO, InsertPointTy AllocaIP) {
9369 if (!updateToLocation(Loc))
9370 return Loc.IP;
9371
9372 assert(X.Var->getType()->isPointerTy() &&
9373 "OMP Atomic expects a pointer to target memory");
9374 Type *XElemTy = X.ElemTy;
9375 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9376 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9377 "OMP atomic write expected a scalar type");
9378
9379 if (XElemTy->isIntegerTy()) {
9380 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9381 XSt->setAtomic(AO);
9382 } else if (XElemTy->isStructTy()) {
9383 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9384 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9385 unsigned LoadSize =
9386 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9387 OpenMPIRBuilder::AtomicInfo atomicInfo(
9388 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9389 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9390 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9391 OldVal->eraseFromParent();
9392 } else {
9393 // We need to bitcast and perform atomic op as integers
9394 IntegerType *IntCastTy =
9395 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9396 Value *ExprCast =
9397 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9398 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9399 XSt->setAtomic(AO);
9400 }
9401
9402 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9403 return Builder.saveIP();
9404}
9405
9406OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9407 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9408 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9409 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9410 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9411 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9412 if (!updateToLocation(Loc))
9413 return Loc.IP;
9414
9415 LLVM_DEBUG({
9416 Type *XTy = X.Var->getType();
9417 assert(XTy->isPointerTy() &&
9418 "OMP Atomic expects a pointer to target memory");
9419 Type *XElemTy = X.ElemTy;
9420 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9421 XElemTy->isPointerTy()) &&
9422 "OMP atomic update expected a scalar type");
9423 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9424 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9425 "OpenMP atomic does not support LT or GT operations");
9426 });
9427
9428 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9429 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9430 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9431 if (!AtomicResult)
9432 return AtomicResult.takeError();
9433 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9434 return Builder.saveIP();
9435}
9436
9437// FIXME: Duplicating AtomicExpand
9438Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9439 AtomicRMWInst::BinOp RMWOp) {
9440 switch (RMWOp) {
9441 case AtomicRMWInst::Add:
9442 return Builder.CreateAdd(Src1, Src2);
9443 case AtomicRMWInst::Sub:
9444 return Builder.CreateSub(Src1, Src2);
9445 case AtomicRMWInst::And:
9446 return Builder.CreateAnd(Src1, Src2);
9447 case AtomicRMWInst::Nand:
9448 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9449 case AtomicRMWInst::Or:
9450 return Builder.CreateOr(Src1, Src2);
9451 case AtomicRMWInst::Xor:
9452 return Builder.CreateXor(Src1, Src2);
9453 case AtomicRMWInst::Xchg:
9454 case AtomicRMWInst::FAdd:
9455 case AtomicRMWInst::FSub:
9456 case AtomicRMWInst::BAD_BINOP:
9457 case AtomicRMWInst::Max:
9458 case AtomicRMWInst::Min:
9459 case AtomicRMWInst::UMax:
9460 case AtomicRMWInst::UMin:
9461 case AtomicRMWInst::FMax:
9462 case AtomicRMWInst::FMin:
9463 case AtomicRMWInst::FMaximum:
9464 case AtomicRMWInst::FMinimum:
9465 case AtomicRMWInst::UIncWrap:
9466 case AtomicRMWInst::UDecWrap:
9467 case AtomicRMWInst::USubCond:
9468 case AtomicRMWInst::USubSat:
9469 llvm_unreachable("Unsupported atomic update operation");
9470 }
9471 llvm_unreachable("Unsupported atomic update operation");
9472}
9473
9474Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9475 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9476 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9477 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9478 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9479 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9480 // or a complex datatype.
9481 bool emitRMWOp = false;
9482 switch (RMWOp) {
9483 case AtomicRMWInst::Add:
9484 case AtomicRMWInst::And:
9485 case AtomicRMWInst::Nand:
9486 case AtomicRMWInst::Or:
9487 case AtomicRMWInst::Xor:
9488 case AtomicRMWInst::Xchg:
9489 emitRMWOp = XElemTy;
9490 break;
9491 case AtomicRMWInst::Sub:
9492 emitRMWOp = (IsXBinopExpr && XElemTy);
9493 break;
9494 default:
9495 emitRMWOp = false;
9496 }
9497 emitRMWOp &= XElemTy->isIntegerTy();
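// Editorial note: only integer element types take the native atomicrmw fast
// path; every other case falls through to one of the compare-exchange-based
// update loops below.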
9498
9499 std::pair<Value *, Value *> Res;
9500 if (emitRMWOp) {
9501 AtomicRMWInst *RMWInst =
9502 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9503 if (T.isAMDGPU()) {
9504 if (IsIgnoreDenormalMode)
9505 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9506 llvm::MDNode::get(Builder.getContext(), {}));
9507 if (!IsFineGrainedMemory)
9508 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9509 llvm::MDNode::get(Builder.getContext(), {}));
9510 if (!IsRemoteMemory)
9511 RMWInst->setMetadata("amdgpu.no.remote.memory",
9512 llvm::MDNode::get(Builder.getContext(), {}));
9513 }
9514 Res.first = RMWInst;
9515 // Not needed except in the case of postfix captures. Generate it anyway for
9516 // consistency with the else branch; any DCE pass will remove it.
9517 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9518 if (RMWOp == AtomicRMWInst::Xchg)
9519 Res.second = Res.first;
9520 else
9521 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9522 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9523 XElemTy->isStructTy()) {
9524 LoadInst *OldVal =
9525 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9526 OldVal->setAtomic(AO);
9527 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9528 unsigned LoadSize =
9529 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9530
9531 OpenMPIRBuilder::AtomicInfo atomicInfo(
9532 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9533 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9534 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9535 BasicBlock *CurBB = Builder.GetInsertBlock();
9536 Instruction *CurBBTI = CurBB->getTerminator();
9537 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9538 BasicBlock *ExitBB =
9539 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9540 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9541 X->getName() + ".atomic.cont");
9542 ContBB->getTerminator()->eraseFromParent();
9543 Builder.restoreIP(AllocaIP);
9544 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9545 NewAtomicAddr->setName(X->getName() + "x.new.val");
9546 Builder.SetInsertPoint(ContBB);
9547 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9548 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9549 Value *OldExprVal = PHI;
9550 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9551 if (!CBResult)
9552 return CBResult.takeError();
9553 Value *Upd = *CBResult;
9554 Builder.CreateStore(Upd, NewAtomicAddr);
9555 AtomicOrdering Failure =
9556 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9557 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9558 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9559 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9560 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9561 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9562 OldVal->eraseFromParent();
9563 Res.first = OldExprVal;
9564 Res.second = Upd;
9565
9566 if (UnreachableInst *ExitTI =
9567 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9568 CurBBTI->eraseFromParent();
9569 Builder.SetInsertPoint(ExitBB);
9570 } else {
9571 Builder.SetInsertPoint(ExitTI);
9572 }
9573 } else {
9574 IntegerType *IntCastTy =
9575 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9576 LoadInst *OldVal =
9577 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9578 OldVal->setAtomic(AO);
9579 // CurBB
9580 // | /---\
9581 // ContBB |
9582 // | \---/
9583 // ExitBB
9584 BasicBlock *CurBB = Builder.GetInsertBlock();
9585 Instruction *CurBBTI = CurBB->getTerminator();
9586 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9587 BasicBlock *ExitBB =
9588 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9589 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9590 X->getName() + ".atomic.cont");
9591 ContBB->getTerminator()->eraseFromParent();
9592 Builder.restoreIP(AllocaIP);
9593 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9594 NewAtomicAddr->setName(X->getName() + "x.new.val");
9595 Builder.SetInsertPoint(ContBB);
9596 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9597 PHI->addIncoming(OldVal, CurBB);
9598 bool IsIntTy = XElemTy->isIntegerTy();
9599 Value *OldExprVal = PHI;
9600 if (!IsIntTy) {
9601 if (XElemTy->isFloatingPointTy()) {
9602 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9603 X->getName() + ".atomic.fltCast");
9604 } else {
9605 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9606 X->getName() + ".atomic.ptrCast");
9607 }
9608 }
9609
9610 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9611 if (!CBResult)
9612 return CBResult.takeError();
9613 Value *Upd = *CBResult;
9614 Builder.CreateStore(Upd, NewAtomicAddr);
9615 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9616 AtomicOrdering Failure =
9617 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9618 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9619 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9620 Result->setVolatile(VolatileX);
9621 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9622 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9623 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9624 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9625
9626 Res.first = OldExprVal;
9627 Res.second = Upd;
9628
9629 // Set the insertion point in the exit block.
9630 if (UnreachableInst *ExitTI =
9631 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9632 CurBBTI->eraseFromParent();
9633 Builder.SetInsertPoint(ExitBB);
9634 } else {
9635 Builder.SetInsertPoint(ExitTI);
9636 }
9637 }
9638
9639 return Res;
9640}
9641
9642OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9643 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9644 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9645 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9646 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9647 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9648 if (!updateToLocation(Loc))
9649 return Loc.IP;
9650
9651 LLVM_DEBUG({
9652 Type *XTy = X.Var->getType();
9653 assert(XTy->isPointerTy() &&
9654 "OMP Atomic expects a pointer to target memory");
9655 Type *XElemTy = X.ElemTy;
9656 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9657 XElemTy->isPointerTy()) &&
9658 "OMP atomic capture expected a scalar type");
9659 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9660 "OpenMP atomic does not support LT or GT operations");
9661 });
9662
9663 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9664 // 'x' is simply atomically rewritten with 'expr'.
9665 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
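// Editorial note: using Xchg here makes emitAtomicUpdate store Expr
// unconditionally, which implements the plain-assignment form of the capture
// construct.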
9666 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9667 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9668 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9669 if (!AtomicResult)
9670 return AtomicResult.takeError();
9671 Value *CapturedVal =
9672 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9673 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9674
9675 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9676 return Builder.saveIP();
9677}
9678
9679OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9680 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9681 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9682 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9683 bool IsFailOnly) {
9684
9685 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9686 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9687 IsPostfixUpdate, IsFailOnly, Failure);
9688}
9689
9690OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9691 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9692 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9693 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9694 bool IsFailOnly, AtomicOrdering Failure) {
9695
9696 if (!updateToLocation(Loc))
9697 return Loc.IP;
9698
9699 assert(X.Var->getType()->isPointerTy() &&
9700 "OMP atomic expects a pointer to target memory");
9701 // compare capture
9702 if (V.Var) {
9703 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9704 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9705 }
9706
9707 bool IsInteger = E->getType()->isIntegerTy();
9708
9709 if (Op == OMPAtomicCompareOp::EQ) {
9710 AtomicCmpXchgInst *Result = nullptr;
9711 if (!IsInteger) {
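// Editorial note: cmpxchg only accepts integer (or pointer) operands, so
// floating-point expected/desired values are bit-cast to a same-width integer
// first.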
9712 IntegerType *IntCastTy =
9713 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9714 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9715 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9716 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9717 AO, Failure);
9718 } else {
9719 Result =
9720 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9721 }
9722
9723 if (V.Var) {
9724 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9725 if (!IsInteger)
9726 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9727 assert(OldValue->getType() == V.ElemTy &&
9728 "OldValue and V must be of same type");
9729 if (IsPostfixUpdate) {
9730 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9731 } else {
9732 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9733 if (IsFailOnly) {
9734 // CurBB----
9735 // | |
9736 // v |
9737 // ContBB |
9738 // | |
9739 // v |
9740 // ExitBB <-
9741 //
9742 // where ContBB only contains the store of old value to 'v'.
9743 BasicBlock *CurBB = Builder.GetInsertBlock();
9744 Instruction *CurBBTI = CurBB->getTerminator();
9745 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9746 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9747 CurBBTI, X.Var->getName() + ".atomic.exit");
9748 BasicBlock *ContBB = CurBB->splitBasicBlock(
9749 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9750 ContBB->getTerminator()->eraseFromParent();
9751 CurBB->getTerminator()->eraseFromParent();
9752
9753 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9754
9755 Builder.SetInsertPoint(ContBB);
9756 Builder.CreateStore(OldValue, V.Var);
9757 Builder.CreateBr(ExitBB);
9758
9759 if (UnreachableInst *ExitTI =
9760 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9761 CurBBTI->eraseFromParent();
9762 Builder.SetInsertPoint(ExitBB);
9763 } else {
9764 Builder.SetInsertPoint(ExitTI);
9765 }
9766 } else {
9767 Value *CapturedValue =
9768 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9769 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9770 }
9771 }
9772 }
9773 // The comparison result has to be stored.
9774 if (R.Var) {
9775 assert(R.Var->getType()->isPointerTy() &&
9776 "r.var must be of pointer type");
9777 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9778
9779 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9780 Value *ResultCast = R.IsSigned
9781 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9782 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9783 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9784 }
9785 } else {
9786 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9787 "Op should be either max or min at this point");
9788 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9789
9790 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9791 // Let's take max as example.
9792 // OpenMP form:
9793 // x = x > expr ? expr : x;
9794 // LLVM form:
9795 // *ptr = *ptr > val ? *ptr : val;
9796 // We need to transform to LLVM form.
9797 // x = x <= expr ? x : expr;
9798 AtomicRMWInst::BinOp NewOp;
9799 if (IsXBinopExpr) {
9800 if (IsInteger) {
9801 if (X.IsSigned)
9802 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9803 : AtomicRMWInst::Max;
9804 else
9805 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9806 : AtomicRMWInst::UMax;
9807 } else {
9808 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9809 : AtomicRMWInst::FMax;
9810 }
9811 } else {
9812 if (IsInteger) {
9813 if (X.IsSigned)
9814 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9815 : AtomicRMWInst::Min;
9816 else
9817 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9818 : AtomicRMWInst::UMin;
9819 } else {
9820 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9821 : AtomicRMWInst::FMin;
9822 }
9823 }
9824
9825 AtomicRMWInst *OldValue =
9826 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9827 if (V.Var) {
9828 Value *CapturedValue = nullptr;
9829 if (IsPostfixUpdate) {
9830 CapturedValue = OldValue;
9831 } else {
9832 CmpInst::Predicate Pred;
9833 switch (NewOp) {
9834 case AtomicRMWInst::Max:
9835 Pred = CmpInst::ICMP_SGT;
9836 break;
9837 case AtomicRMWInst::UMax:
9838 Pred = CmpInst::ICMP_UGT;
9839 break;
9840 case AtomicRMWInst::FMax:
9841 Pred = CmpInst::FCMP_OGT;
9842 break;
9843 case AtomicRMWInst::Min:
9844 Pred = CmpInst::ICMP_SLT;
9845 break;
9846 case AtomicRMWInst::UMin:
9847 Pred = CmpInst::ICMP_ULT;
9848 break;
9849 case AtomicRMWInst::FMin:
9850 Pred = CmpInst::FCMP_OLT;
9851 break;
9852 default:
9853 llvm_unreachable("unexpected comparison op");
9854 }
9855 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9856 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9857 }
9858 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9859 }
9860 }
9861
9862 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9863
9864 return Builder.saveIP();
9865}
9866
9867OpenMPIRBuilder::InsertPointOrErrorTy
9868OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9869 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9870 Value *NumTeamsUpper, Value *ThreadLimit,
9871 Value *IfExpr) {
9872 if (!updateToLocation(Loc))
9873 return InsertPointTy();
9874
9875 uint32_t SrcLocStrSize;
9876 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9877 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9878 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9879
9880 // Outer allocation basicblock is the entry block of the current function.
9881 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9882 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9883 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9884 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9885 }
9886
9887 // The current basic block is split into four basic blocks. After outlining,
9888 // they will be mapped as follows:
9889 // ```
9890 // def current_fn() {
9891 // current_basic_block:
9892 // br label %teams.exit
9893 // teams.exit:
9894 // ; instructions after teams
9895 // }
9896 //
9897 // def outlined_fn() {
9898 // teams.alloca:
9899 // br label %teams.body
9900 // teams.body:
9901 // ; instructions within teams body
9902 // }
9903 // ```
9904 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9905 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9906 BasicBlock *AllocaBB =
9907 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9908
9909 bool SubClausesPresent =
9910 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9911 // Push num_teams
9912 if (!Config.isTargetDevice() && SubClausesPresent) {
9913 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9914 "if lowerbound is non-null, then upperbound must also be non-null "
9915 "for bounds on num_teams");
9916
9917 if (NumTeamsUpper == nullptr)
9918 NumTeamsUpper = Builder.getInt32(0);
9919
9920 if (NumTeamsLower == nullptr)
9921 NumTeamsLower = NumTeamsUpper;
9922
9923 if (IfExpr) {
9924 assert(IfExpr->getType()->isIntegerTy() &&
9925 "argument to if clause must be an integer value");
9926
9927 // upper = ifexpr ? upper : 1
9928 if (IfExpr->getType() != Int1)
9929 IfExpr = Builder.CreateICmpNE(IfExpr,
9930 ConstantInt::get(IfExpr->getType(), 0));
9931 NumTeamsUpper = Builder.CreateSelect(
9932 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9933
9934 // lower = ifexpr ? lower : 1
9935 NumTeamsLower = Builder.CreateSelect(
9936 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9937 }
9938
9939 if (ThreadLimit == nullptr)
9940 ThreadLimit = Builder.getInt32(0);
9941
9942 Value *ThreadNum = getOrCreateThreadID(Ident);
9943 Builder.CreateCall(
9944 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9945 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9946 }
9947 // Generate the body of teams.
9948 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9949 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9950 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9951 return Err;
9952
9953 OutlineInfo OI;
9954 OI.EntryBB = AllocaBB;
9955 OI.ExitBB = ExitBB;
9956 OI.OuterAllocaBB = &OuterAllocaBB;
9957
9958 // Insert fake values for global tid and bound tid.
9959 SmallVector<Instruction *, 2> ToBeDeleted;
9960 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9961 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9962 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9963 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9964 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9965
9966 auto HostPostOutlineCB = [this, Ident,
9967 ToBeDeleted](Function &OutlinedFn) mutable {
9968 // The stale call instruction will be replaced with a new call instruction
9969 // for the runtime call that takes the outlined function.
9970
9971 assert(OutlinedFn.hasOneUse() &&
9972 "there must be a single user for the outlined function");
9973 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9974 ToBeDeleted.push_back(StaleCI);
9975
9976 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9977 "Outlined function must have two or three arguments only");
9978
9979 bool HasShared = OutlinedFn.arg_size() == 3;
9980
9981 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9982 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9983 if (HasShared)
9984 OutlinedFn.getArg(2)->setName("data");
9985
9986 // Call to the runtime function for teams in the current function.
9987 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9988 "outlined function.");
9989 Builder.SetInsertPoint(StaleCI);
9990 SmallVector<Value *> Args = {
9991 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9992 if (HasShared)
9993 Args.push_back(StaleCI->getArgOperand(2));
9994 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9995 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9996 Args);
9997
9998 for (Instruction *I : llvm::reverse(ToBeDeleted))
9999 I->eraseFromParent();
10000 };
10001
10002 if (!Config.isTargetDevice())
10003 OI.PostOutlineCB = HostPostOutlineCB;
10004
10005 addOutlineInfo(std::move(OI));
10006
10007 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10008
10009 return Builder.saveIP();
10010}
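// A minimal host-side usage sketch for createTeams (hypothetical frontend
// code, not part of this file; assumes an initialized OpenMPIRBuilder
// `OMPBuilder` and an IRBuilder<> `Builder` positioned at the construct):
// ```
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                        OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
//     Builder.restoreIP(CodeGenIP);
//     // ... emit the body of the teams region here ...
//     return Error::success();
//   };
//   OpenMPIRBuilder::LocationDescription Loc(Builder);
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTeams(
//       Loc, BodyGenCB, /*NumTeamsLower=*/nullptr, /*NumTeamsUpper=*/nullptr,
//       /*ThreadLimit=*/nullptr, /*IfExpr=*/nullptr);
//   if (!AfterIP)
//     return AfterIP.takeError();
//   Builder.restoreIP(*AfterIP);
// ```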
10011
10012OpenMPIRBuilder::InsertPointOrErrorTy
10013OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10014 InsertPointTy OuterAllocaIP,
10015 BodyGenCallbackTy BodyGenCB) {
10016 if (!updateToLocation(Loc))
10017 return InsertPointTy();
10018
10019 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10020
10021 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10022 BasicBlock *BodyBB =
10023 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10024 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10025 }
10026 BasicBlock *ExitBB =
10027 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10028 BasicBlock *BodyBB =
10029 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10030 BasicBlock *AllocaBB =
10031 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10032
10033 // Generate the body of the distribute region.
10034 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10035 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10036 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10037 return Err;
10038
10039 // When compiling for a target device we use different runtime functions,
10040 // which require a callback.
10041 if (Config.isTargetDevice()) {
10042 OutlineInfo OI;
10043 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10044 OI.EntryBB = AllocaBB;
10045 OI.ExitBB = ExitBB;
10046
10047 addOutlineInfo(std::move(OI));
10048 }
10049 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10050
10051 return Builder.saveIP();
10052}
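// createDistribute follows the same calling pattern as createTeams above
// (sketch; the extra OuterAllocaIP designates where allocas that must
// outlive the distribute region are placed):
// ```
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//       OMPBuilder.createDistribute(Loc, OuterAllocaIP, BodyGenCB);
//   if (!AfterIP)
//     return AfterIP.takeError();
//   Builder.restoreIP(*AfterIP);
// ```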
10053
10054GlobalVariable *
10055OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10056 std::string VarName) {
10057 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10058 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10059 Names.size()),
10060 Names);
10061 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10062 M, MapNamesArrayInit->getType(),
10063 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10064 VarName);
10065 return MapNamesArrayGlobal;
10066}
10067
10068 // Create all simple and struct types exposed by the runtime and remember
10069 // their llvm::PointerTypes for easy access later.
10070void OpenMPIRBuilder::initializeTypes(Module &M) {
10071 LLVMContext &Ctx = M.getContext();
10072 StructType *T;
10073 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10074#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10075#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10076 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10077 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10078#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10079 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10080 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10081#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10082 T = StructType::getTypeByName(Ctx, StructName); \
10083 if (!T) \
10084 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10085 VarName = T; \
10086 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10087#include "llvm/Frontend/OpenMP/OMPKinds.def"
10088}
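// For illustration, the OMP_STRUCT_TYPE entry for ident_t in OMPKinds.def
// makes the macro above expand to roughly (a sketch, not verbatim
// preprocessor output):
// ```
//   T = StructType::getTypeByName(Ctx, "ident_t");
//   if (!T)
//     T = StructType::create(Ctx, {Int32, Int32, Int32, Int32, Int8Ptr},
//                            "ident_t", /*Packed=*/false);
//   Ident = T;
//   IdentPtr = PointerType::get(Ctx, DefaultTargetAS);
// ```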
10089
10090void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10091 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10092 SmallVectorImpl<BasicBlock *> &BlockVector) {
10093 SmallVector<BasicBlock *, 32> Worklist;
10094 BlockSet.insert(EntryBB);
10095 BlockSet.insert(ExitBB);
10096
10097 Worklist.push_back(EntryBB);
10098 while (!Worklist.empty()) {
10099 BasicBlock *BB = Worklist.pop_back_val();
10100 BlockVector.push_back(BB);
10101 for (BasicBlock *SuccBB : successors(BB))
10102 if (BlockSet.insert(SuccBB).second)
10103 Worklist.push_back(SuccBB);
10104 }
10105}
10106
10107void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10108 uint64_t Size, int32_t Flags,
10109 GlobalValue::LinkageTypes,
10110 StringRef Name) {
10111 if (!Config.isGPU()) {
10112 llvm::offloading::emitOffloadingEntry(
10113 M, object::OffloadKind::OFK_OpenMP, ID,
10114 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10115 return;
10116 }
10117 // TODO: Add support for global variables on the device after declare target
10118 // support.
10119 Function *Fn = dyn_cast<Function>(Addr);
10120 if (!Fn)
10121 return;
10122
10123 // Add a function attribute for the kernel.
10124 Fn->addFnAttr("kernel");
10125 if (T.isAMDGCN())
10126 Fn->addFnAttr("uniform-work-group-size", "true");
10127 Fn->addFnAttr(Attribute::MustProgress);
10128}
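// On a GPU target the function above leaves the kernel marked like this
// (illustrative IR; the uniform-work-group-size attribute is added only for
// AMDGCN):
// ```
//   define void @__omp_offloading_..._foo_l7(...) #0
//   attributes #0 = { mustprogress "kernel" "uniform-work-group-size"="true" }
// ```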
10129
10130 // We only generate metadata for functions that contain target regions.
10131void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10132 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10133
10134 // If there are no entries, we don't need to do anything.
10135 if (OffloadInfoManager.empty())
10136 return;
10137
10138 LLVMContext &C = M.getContext();
10139 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10140 TargetRegionEntryInfo>,
10141 16>
10142 OrderedEntries(OffloadInfoManager.size());
10143
10144 // Auxiliary methods to create metadata values and strings.
10145 auto &&GetMDInt = [this](unsigned V) {
10146 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10147 };
10148
10149 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10150
10151 // Create the offloading info metadata node.
10152 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10153 auto &&TargetRegionMetadataEmitter =
10154 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10155 const TargetRegionEntryInfo &EntryInfo,
10156 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10157 // Generate metadata for target regions. Each entry of this metadata
10158 // contains:
10159 // - Entry 0 -> Kind of this type of metadata (0).
10160 // - Entry 1 -> Device ID of the file where the entry was identified.
10161 // - Entry 2 -> File ID of the file where the entry was identified.
10162 // - Entry 3 -> Mangled name of the function where the entry was
10163 // identified.
10164 // - Entry 4 -> Line in the file where the entry was identified.
10165 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10166 // - Entry 6 -> Order the entry was created.
10167 // The first element of the metadata node is the kind.
10168 Metadata *Ops[] = {
10169 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10170 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10171 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10172 GetMDInt(E.getOrder())};
10173
10174 // Save this entry in the right position of the ordered entries array.
10175 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10176
10177 // Add metadata to the named metadata node.
10178 MD->addOperand(MDNode::get(C, Ops));
10179 };
10180
10181 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10182
10183 // Create a function that emits metadata for each device global variable entry.
10184 auto &&DeviceGlobalVarMetadataEmitter =
10185 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10186 StringRef MangledName,
10187 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10188 // Generate metadata for global variables. Each entry of this metadata
10189 // contains:
10190 // - Entry 0 -> Kind of this type of metadata (1).
10191 // - Entry 1 -> Mangled name of the variable.
10192 // - Entry 2 -> Declare target kind.
10193 // - Entry 3 -> Order the entry was created.
10194 // The first element of the metadata node is the kind.
10195 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10196 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10197
10198 // Save this entry in the right position of the ordered entries array.
10199 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10200 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10201
10202 // Add metadata to the named metadata node.
10203 MD->addOperand(MDNode::get(C, Ops));
10204 };
10205
10206 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10207 DeviceGlobalVarMetadataEmitter);
10208
10209 for (const auto &E : OrderedEntries) {
10210 assert(E.first && "All ordered entries must exist!");
10211 if (const auto *CE =
10212 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10213 E.first)) {
10214 if (!CE->getID() || !CE->getAddress()) {
10215 // Do not blame the entry if the parent function is not emitted.
10216 TargetRegionEntryInfo EntryInfo = E.second;
10217 StringRef FnName = EntryInfo.ParentName;
10218 if (!M.getNamedValue(FnName))
10219 continue;
10220 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10221 continue;
10222 }
10223 createOffloadEntry(CE->getID(), CE->getAddress(),
10224 /*Size=*/0, CE->getFlags(),
10225 GlobalValue::WeakAnyLinkage);
10226 } else if (const auto *CE = dyn_cast<
10227 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10228 E.first)) {
10229 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10230 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10231 CE->getFlags());
10232 switch (Flags) {
10233 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10234 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10235 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10236 continue;
10237 if (!CE->getAddress()) {
10238 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10239 continue;
10240 }
10241 // The variable has no definition - no need to add the entry.
10242 if (CE->getVarSize() == 0)
10243 continue;
10244 break;
10245 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10246 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10247 (!Config.isTargetDevice() && CE->getAddress())) &&
10248 "Declaret target link address is set.");
10249 if (Config.isTargetDevice())
10250 continue;
10251 if (!CE->getAddress()) {
10252 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10253 continue;
10254 }
10255 break;
10256 default:
10257 break;
10258 }
10259
10260 // Hidden or internal symbols on the device are not externally visible.
10261 // We should not attempt to register them by creating an offloading
10262 // entry. Indirect variables are handled separately on the device.
10263 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10264 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10265 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10266 continue;
10267
10268 // Indirect globals need to use a special name that doesn't match the name
10269 // of the associated host global.
10270 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10271 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10272 Flags, CE->getLinkage(), CE->getVarName());
10273 else
10274 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10275 Flags, CE->getLinkage());
10276
10277 } else {
10278 llvm_unreachable("Unsupported entry kind.");
10279 }
10280 }
10281
10282 // Emit requires directive globals to a special entry so the runtime can
10283 // register them when the device image is loaded.
10284 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10285 // entries should be redesigned to better suit this use-case.
10286 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10290 ".requires", /*Size=*/0,
10291 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10292 Config.getRequiresFlags());
10293}
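// For reference, the !omp_offload.info metadata emitted above looks like the
// following (illustrative operand values, matching the per-entry layout
// documented in the emitter lambdas):
// ```
//   !omp_offload.info = !{!0, !1}
//   !0 = !{i32 0, i32 43, i32 2054, !"foo", i32 7, i32 0, i32 0} ; target region
//   !1 = !{i32 1, !"global_var", i32 0, i32 1}                   ; declare target var
// ```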
10294
10295void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10296 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10297 unsigned FileID, unsigned Line, unsigned Count) {
10298 raw_svector_ostream OS(Name);
10299 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10300 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10301 if (Count)
10302 OS << "_" << Count;
10303}
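// Worked example (hypothetical values, assuming the usual "__omp_offloading_"
// KernelNamePrefix): DeviceID=0x10, FileID=0x2b, ParentName="foo", Line=7,
// Count=0 produces "__omp_offloading_10_2b_foo_l7"; with Count=2 the name
// gains a trailing "_2".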
10304
10305void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10306 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10307 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10308 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10309 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10310 EntryInfo.Line, NewCount);
10311}
10312
10313TargetRegionEntryInfo
10314OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10315 StringRef ParentName) {
10316 sys::fs::UniqueID ID(0xdeadf17e, 0);
10317 auto FileIDInfo = CallBack();
10318 uint64_t FileID = 0;
10319 std::error_code EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID);
10320 // If the inode ID could not be determined, create a hash value from
10321 // the current file name and use that as an ID.
10322 if (EC)
10323 FileID = hash_value(std::get<0>(FileIDInfo));
10324 else
10325 FileID = ID.getFile();
10326
10327 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10328 std::get<1>(FileIDInfo));
10329}
10330
10331unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10332 unsigned Offset = 0;
10333 for (uint64_t Remain =
10334 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10335 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10336 !(Remain & 1); Remain = Remain >> 1)
10337 Offset++;
10338 return Offset;
10339}
10340
10341omp::OpenMPOffloadMappingFlags
10342OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10343 // Shift left by getFlagMemberOffset() bits.
10344 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10345 << getFlagMemberOffset());
10346}
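// Worked example (assuming OMP_MAP_MEMBER_OF occupies the top 16 bits, i.e.
// has value 0xFFFF'0000'0000'0000): getFlagMemberOffset() scans past 48
// trailing zero bits and returns 48, so getMemberOfFlag(0) yields
// (0 + 1) << 48 == 0x0001'0000'0000'0000, encoding "member of map entry 1".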
10347
10348void OpenMPIRBuilder::setCorrectMemberOfFlag(
10349 omp::OpenMPOffloadMappingFlags &Flags,
10350 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10351 // If the entry is PTR_AND_OBJ but has not been marked with the special
10352 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10353 // marked as MEMBER_OF.
10354 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10355 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10356 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10357 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10358 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10359 return;
10360
10361 // Reset the placeholder value to prepare the flag for the assignment of the
10362 // proper MEMBER_OF value.
10363 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10364 Flags |= MemberOfFlag;
10365}
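// Usage sketch (hypothetical): attach a struct member's map flags to its
// parent map entry at position Idx.
// ```
//   omp::OpenMPOffloadMappingFlags MemberOf = OMPBuilder.getMemberOfFlag(Idx);
//   OMPBuilder.setCorrectMemberOfFlag(MemberFlags, MemberOf);
// ```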
10366
10367Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10368 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10369 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10370 bool IsDeclaration, bool IsExternallyVisible,
10371 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10372 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10373 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10374 std::function<Constant *()> GlobalInitializer,
10375 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10376 // TODO: convert this to utilise the IRBuilder Config rather than
10377 // a passed down argument.
10378 if (OpenMPSIMD)
10379 return nullptr;
10380
10381 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10382 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10383 CaptureClause ==
10384 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10385 Config.hasRequiresUnifiedSharedMemory())) {
10386 SmallString<64> PtrName;
10387 {
10388 raw_svector_ostream OS(PtrName);
10389 OS << MangledName;
10390 if (!IsExternallyVisible)
10391 OS << format("_%x", EntryInfo.FileID);
10392 OS << "_decl_tgt_ref_ptr";
10393 }
10394
10395 Value *Ptr = M.getNamedValue(PtrName);
10396
10397 if (!Ptr) {
10398 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10399 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10400
10401 auto *GV = cast<GlobalVariable>(Ptr);
10402 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10403
10404 if (!Config.isTargetDevice()) {
10405 if (GlobalInitializer)
10406 GV->setInitializer(GlobalInitializer());
10407 else
10408 GV->setInitializer(GlobalValue);
10409 }
10410
10411 registerTargetGlobalVariable(
10412 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10413 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10414 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10415 }
10416
10417 return cast<Constant>(Ptr);
10418 }
10419
10420 return nullptr;
10421}
10422
10423void OpenMPIRBuilder::registerTargetGlobalVariable(
10424 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10425 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10426 bool IsDeclaration, bool IsExternallyVisible,
10427 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10428 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10429 std::vector<Triple> TargetTriple,
10430 std::function<Constant *()> GlobalInitializer,
10431 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10432 Constant *Addr) {
10433 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10434 (TargetTriple.empty() && !Config.isTargetDevice()))
10435 return;
10436
10437 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10438 StringRef VarName;
10439 int64_t VarSize;
10440 GlobalValue::LinkageTypes Linkage;
10441
10442 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10443 CaptureClause ==
10444 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10445 !Config.hasRequiresUnifiedSharedMemory()) {
10446 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10447 VarName = MangledName;
10448 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10449
10450 if (!IsDeclaration)
10451 VarSize = divideCeil(
10452 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10453 else
10454 VarSize = 0;
10455 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10456
10457 // This is a workaround carried over from Clang which prevents undesired
10458 // optimisation of internal variables.
10459 if (Config.isTargetDevice() &&
10460 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10461 // Do not create a "ref-variable" if the original is not also available
10462 // on the host.
10463 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10464 return;
10465
10466 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10467
10468 if (!M.getNamedValue(RefName)) {
10469 Constant *AddrRef =
10470 getOrCreateInternalVariable(Addr->getType(), RefName);
10471 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10472 GvAddrRef->setConstant(true);
10473 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10474 GvAddrRef->setInitializer(Addr);
10475 GeneratedRefs.push_back(GvAddrRef);
10476 }
10477 }
10478 } else {
10479 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10480 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10481 else
10482 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10483
10484 if (Config.isTargetDevice()) {
10485 VarName = (Addr) ? Addr->getName() : "";
10486 Addr = nullptr;
10487 } else {
10488 Addr = getAddrOfDeclareTargetVar(
10489 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10490 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10491 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10492 VarName = (Addr) ? Addr->getName() : "";
10493 }
10494 VarSize = M.getDataLayout().getPointerSize();
10495 Linkage = GlobalValue::WeakAnyLinkage;
10496 }
10497
10498 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10499 Flags, Linkage);
10500}
10501
10502/// Loads all the offload entries information from the host IR
10503/// metadata.
10504void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10505 // If we are in target mode, load the metadata from the host IR. This code has
10506 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10507
10508 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10509 if (!MD)
10510 return;
10511
10512 for (MDNode *MN : MD->operands()) {
10513 auto &&GetMDInt = [MN](unsigned Idx) {
10514 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10515 return cast<ConstantInt>(V->getValue())->getZExtValue();
10516 };
10517
10518 auto &&GetMDString = [MN](unsigned Idx) {
10519 auto *V = cast<MDString>(MN->getOperand(Idx));
10520 return V->getString();
10521 };
10522
10523 switch (GetMDInt(0)) {
10524 default:
10525 llvm_unreachable("Unexpected metadata!");
10526 break;
10527 case OffloadEntriesInfoManager::OffloadEntryInfo::
10528 OffloadingEntryInfoTargetRegion: {
10529 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10530 /*DeviceID=*/GetMDInt(1),
10531 /*FileID=*/GetMDInt(2),
10532 /*Line=*/GetMDInt(4),
10533 /*Count=*/GetMDInt(5));
10534 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10535 /*Order=*/GetMDInt(6));
10536 break;
10537 }
10538 case OffloadEntriesInfoManager::OffloadEntryInfo::
10539 OffloadingEntryInfoDeviceGlobalVar:
10540 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10541 /*MangledName=*/GetMDString(1),
10542 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10543 /*Flags=*/GetMDInt(2)),
10544 /*Order=*/GetMDInt(3));
10545 break;
10546 }
10547 }
10548}
10549
10550void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10551 StringRef HostFilePath) {
10552 if (HostFilePath.empty())
10553 return;
10554
10555 auto Buf = VFS.getBufferForFile(HostFilePath);
10556 if (std::error_code Err = Buf.getError()) {
10557 report_fatal_error(("error opening host file from host file path inside of "
10558 "OpenMPIRBuilder: " +
10559 Err.message())
10560 .c_str());
10561 }
10562
10563 LLVMContext Ctx;
10564 auto M = expectedToErrorOrAndEmitErrors(
10565 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10566 if (std::error_code Err = M.getError()) {
10568 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10569 .c_str());
10570 }
10571
10572 loadOffloadInfoMetadata(*M.get());
10573}
10574
10575//===----------------------------------------------------------------------===//
10576// OffloadEntriesInfoManager
10577//===----------------------------------------------------------------------===//
10578
10579bool OffloadEntriesInfoManager::empty() const {
10580 return OffloadEntriesTargetRegion.empty() &&
10581 OffloadEntriesDeviceGlobalVar.empty();
10582}
10583
10584unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10585 const TargetRegionEntryInfo &EntryInfo) const {
10586 auto It = OffloadEntriesTargetRegionCount.find(
10587 getTargetRegionEntryCountKey(EntryInfo));
10588 if (It == OffloadEntriesTargetRegionCount.end())
10589 return 0;
10590 return It->second;
10591}
10592
10593void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10594 const TargetRegionEntryInfo &EntryInfo) {
10595 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10596 EntryInfo.Count + 1;
10597}
10598
10599/// Initialize target region entry.
10600void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10601 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10602 OffloadEntriesTargetRegion[EntryInfo] =
10603 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10604 OMPTargetRegionEntryTargetRegion);
10605 ++OffloadingEntriesNum;
10606}
10607
10608void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10609 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10610 OMPTargetRegionEntryKind Flags) {
10611 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10612
10613 // Update the EntryInfo with the next available count for this location.
10614 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10615
10616 // If we are emitting code for a target, the entry is already initialized;
10617 // it only has to be registered.
10618 if (OMPBuilder->Config.isTargetDevice()) {
10619 // This could happen if the device compilation is invoked standalone.
10620 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10621 return;
10622 }
10623 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10624 Entry.setAddress(Addr);
10625 Entry.setID(ID);
10626 Entry.setFlags(Flags);
10627 } else {
10628 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10629 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10630 return;
10631 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10632 "Target region entry already registered!");
10633 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10634 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10635 ++OffloadingEntriesNum;
10636 }
10637 incrementTargetRegionEntryInfoCount(EntryInfo);
10638}
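// Example of the counting scheme above (illustrative): two target regions
// sharing the same DeviceID/FileID/ParentName/Line register with Count 0 and
// Count 1 respectively, which getTargetRegionEntryFnName turns into distinct
// kernel names.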
10639
10640bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10641 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10642
10643 // Update the EntryInfo with the next available count for this location.
10644 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10645
10646 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10647 if (It == OffloadEntriesTargetRegion.end()) {
10648 return false;
10649 }
10650 // Fail if this entry is already registered.
10651 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10652 return false;
10653 return true;
10654}
10655
10656void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10657 const OffloadTargetRegionEntryInfoActTy &Action) {
10658 // Scan all target region entries and perform the provided action.
10659 for (const auto &It : OffloadEntriesTargetRegion) {
10660 Action(It.first, It.second);
10661 }
10662}
10663
10664void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10665 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10666 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10667 ++OffloadingEntriesNum;
10668}
10669
10670void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10671 StringRef VarName, Constant *Addr, int64_t VarSize,
10672 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10673 if (OMPBuilder->Config.isTargetDevice()) {
10674 // This could happen if the device compilation is invoked standalone.
10675 if (!hasDeviceGlobalVarEntryInfo(VarName))
10676 return;
10677 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10678 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10679 if (Entry.getVarSize() == 0) {
10680 Entry.setVarSize(VarSize);
10681 Entry.setLinkage(Linkage);
10682 }
10683 return;
10684 }
10685 Entry.setVarSize(VarSize);
10686 Entry.setLinkage(Linkage);
10687 Entry.setAddress(Addr);
10688 } else {
10689 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10690 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10691 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10692 "Entry not initialized!");
10693 if (Entry.getVarSize() == 0) {
10694 Entry.setVarSize(VarSize);
10695 Entry.setLinkage(Linkage);
10696 }
10697 return;
10698 }
10699 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10700 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10701 Addr, VarSize, Flags, Linkage,
10702 VarName.str());
10703 else
10704 OffloadEntriesDeviceGlobalVar.try_emplace(
10705 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10706 ++OffloadingEntriesNum;
10707 }
10708}
10709
10710void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10711 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10712 // Scan all device global variable entries and perform the provided action.
10713 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10714 Action(E.getKey(), E.getValue());
10715}
10716
10717//===----------------------------------------------------------------------===//
10718// CanonicalLoopInfo
10719//===----------------------------------------------------------------------===//
10720
10721void CanonicalLoopInfo::collectControlBlocks(
10722 SmallVectorImpl<BasicBlock *> &BBs) {
10723 // We only count those BBs as control blocks for which we do not need to
10724 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10725 // flow. For consistency, this also means we do not add the Body block, which
10726 // is just the entry to the body code.
10727 BBs.reserve(BBs.size() + 6);
10728 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10729}
10730
10731BasicBlock *CanonicalLoopInfo::getPreheader() const {
10732 assert(isValid() && "Requires a valid canonical loop");
10733 for (BasicBlock *Pred : predecessors(Header)) {
10734 if (Pred != Latch)
10735 return Pred;
10736 }
10737 llvm_unreachable("Missing preheader");
10738}
10739
10740void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10741 assert(isValid() && "Requires a valid canonical loop");
10742
10743 Instruction *CmpI = &getCond()->front();
10744 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10745 CmpI->setOperand(1, TripCount);
10746
10747#ifndef NDEBUG
10748 assertOK();
10749#endif
10750}
10751
10752void CanonicalLoopInfo::mapIndVar(
10753 llvm::function_ref<Value *(Instruction *)> Updater) {
10754 assert(isValid() && "Requires a valid canonical loop");
10755
10756 Instruction *OldIV = getIndVar();
10757
10758 // Record all uses excluding those introduced by the updater. Uses by the
10759 // CanonicalLoopInfo itself to keep track of the number of iterations are
10760 // excluded.
10761 SmallVector<Use *> ReplacableUses;
10762 for (Use &U : OldIV->uses()) {
10763 auto *User = dyn_cast<Instruction>(U.getUser());
10764 if (!User)
10765 continue;
10766 if (User->getParent() == getCond())
10767 continue;
10768 if (User->getParent() == getLatch())
10769 continue;
10770 ReplacableUses.push_back(&U);
10771 }
10772
10773 // Run the updater that may introduce new uses
10774 Value *NewIV = Updater(OldIV);
10775
10776 // Replace the old uses with the value returned by the updater.
10777 for (Use *U : ReplacableUses)
10778 U->set(NewIV);
10779
10780#ifndef NDEBUG
10781 assertOK();
10782#endif
10783}
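// Usage sketch for mapIndVar (hypothetical; assumes `Builder` and a valid
// CanonicalLoopInfo `CLI` with a 32-bit induction variable): offset every use
// of the IV in the body without disturbing the trip-count logic.
// ```
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(OldIV, Builder.getInt32(7), "iv.shifted");
//   });
// ```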
10784
10785void CanonicalLoopInfo::assertOK() const {
10786#ifndef NDEBUG
10787 // No constraints if this object currently does not describe a loop.
10788 if (!isValid())
10789 return;
10790
10791 BasicBlock *Preheader = getPreheader();
10792 BasicBlock *Body = getBody();
10793 BasicBlock *After = getAfter();
10794
10795 // Verify standard control-flow we use for OpenMP loops.
10796 assert(Preheader);
10797 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10798 "Preheader must terminate with unconditional branch");
10799 assert(Preheader->getSingleSuccessor() == Header &&
10800 "Preheader must jump to header");
10801
10802 assert(Header);
10803 assert(isa<BranchInst>(Header->getTerminator()) &&
10804 "Header must terminate with unconditional branch");
10805 assert(Header->getSingleSuccessor() == Cond &&
10806 "Header must jump to exiting block");
10807
10808 assert(Cond);
10809 assert(Cond->getSinglePredecessor() == Header &&
10810 "Exiting block only reachable from header");
10811
10812 assert(isa<BranchInst>(Cond->getTerminator()) &&
10813 "Exiting block must terminate with conditional branch");
10814 assert(size(successors(Cond)) == 2 &&
10815 "Exiting block must have two successors");
10816 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10817 "Exiting block's first successor jump to the body");
10818 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10819 "Exiting block's second successor must exit the loop");
10820
10821 assert(Body);
10822 assert(Body->getSinglePredecessor() == Cond &&
10823 "Body only reachable from exiting block");
10824 assert(!isa<PHINode>(Body->front()));
10825
10826 assert(Latch);
10828 "Latch must terminate with unconditional branch");
10829 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10830 // TODO: To support simple redirecting of the end of the body code that has
10831 // multiple predecessors; introduce another auxiliary basic block like preheader and after.
10832 assert(Latch->getSinglePredecessor() != nullptr);
10833 assert(!isa<PHINode>(Latch->front()));
10834
10835 assert(Exit);
10836 assert(isa<BranchInst>(Exit->getTerminator()) &&
10837 "Exit block must terminate with unconditional branch");
10838 assert(Exit->getSingleSuccessor() == After &&
10839 "Exit block must jump to after block");
10840
10841 assert(After);
10842 assert(After->getSinglePredecessor() == Exit &&
10843 "After block only reachable from exit block");
10844 assert(After->empty() || !isa<PHINode>(After->front()));
10845
10846 Instruction *IndVar = getIndVar();
10847 assert(IndVar && "Canonical induction variable not found?");
10848 assert(isa<IntegerType>(IndVar->getType()) &&
10849 "Induction variable must be an integer");
10850 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10851 "Induction variable must be a PHI in the loop header");
10852 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10853 assert(
10854 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10855 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10856
10857 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10858 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10859 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10860 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10861 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10862 ->isOne());
10863
10864 Value *TripCount = getTripCount();
10865 assert(TripCount && "Loop trip count not found?");
10866 assert(IndVar->getType() == TripCount->getType() &&
10867 "Trip count and induction variable must have the same type");
10868
10869 auto *CmpI = cast<CmpInst>(&Cond->front());
10870 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10871 "Exit condition must be a signed less-than comparison");
10872 assert(CmpI->getOperand(0) == IndVar &&
10873 "Exit condition must compare the induction variable");
10874 assert(CmpI->getOperand(1) == TripCount &&
10875 "Exit condition must compare with the trip count");
10876#endif
10877}
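// For reference, the canonical loop shape verified above is (sketch):
// ```
//   preheader -> header -> cond --> body -> ... -> latch -> header
//                            \--> exit -> after
// ```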
10878
10879void CanonicalLoopInfo::invalidate() {
10880 Header = nullptr;
10881 Cond = nullptr;
10882 Latch = nullptr;
10883 Exit = nullptr;
10884}
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:523
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2780
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1077
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1439
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1561
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:607
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:63
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1749
iterator_range< op_iterator > operands()
Definition Metadata.h:1845
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:247
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:255
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1040
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1102
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:411
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1118
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:133
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:149
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:130
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:355
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI void emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:85
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
LLVM_ABI std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition Path.cpp:787
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:318
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1705
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1657
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:294
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:738
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2116
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:682
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:342
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1719
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:126
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:405
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...