Reach test success parity with main

chriselrod · chriselrod · commit cf33a1dd55f3 · 2023-04-21T15:54:40.000-04:00
remarks test answers on main are junk.
diff --git a/include/ControlFlowMerging.hpp b/include/ControlFlowMerging.hpp
@@ -7,11 +7,11 @@
 #include "BitSets.hpp"
 #include "Containers/BumpMapSet.hpp"
 #include "Utilities/Allocators.hpp"
+#include <Math/BumpVector.hpp>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
 #include <llvm/ADT/ArrayRef.h>
-
 #include <llvm/ADT/SmallPtrSet.h>
 #include <llvm/ADT/SmallVector.h>
 #include <llvm/IR/BasicBlock.h>
@@ -21,8 +21,8 @@
 #include <set>
 #include <sys/select.h>
 
-void buildInstructionGraph(BumpAlloc<> &alloc, Instruction::Cache &cache,
-                           LinearProgramLoopBlock &LB) {
+inline void buildInstructionGraph(BumpAlloc<> &alloc, Instruction::Cache &cache,
+                                  LinearProgramLoopBlock &LB) {
   for (auto &node : LB.getNodes()) {
     auto access = node.getMemAccesses(alloc, LB.getMemoryAccesses());
     for (auto *mem : access) {
@@ -45,11 +45,11 @@ inline void merge(aset<Instruction *> &merged, aset<Instruction *> &toMerge) {
 }
 struct ReMapper {
   map<Instruction *, Instruction *> reMap;
-  auto operator[](Instruction *I) -> Instruction * {
-    if (auto f = reMap.find(I); f != reMap.end()) return f->second;
-    return I;
+  auto operator[](Instruction *J) -> Instruction * {
+    if (auto f = reMap.find(J); f != reMap.end()) return f->second;
+    return J;
   }
-  void remapFromTo(Instruction *I, Instruction *J) { reMap[I] = J; }
+  void remapFromTo(Instruction *K, Instruction *J) { reMap[K] = J; }
 };
 
 // represents the cost of merging key=>values; cost is hopefully negative.
@@ -119,10 +119,10 @@ struct MergingCost {
   // however, isMerged(I, J) == isMerged(J, I)
   // so we ignore easily swappable parameters
   // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
-  auto isMerged(Instruction *I, Instruction *J) const -> bool {
+  auto isMerged(Instruction *L, Instruction *J) const -> bool {
     Instruction *K = J;
     do {
-      if (I == K) return true;
+      if (L == K) return true;
       K = findMerge(K);
     } while (K && K != J);
     return false;
@@ -160,10 +160,8 @@ struct MergingCost {
     BumpAlloc<> &alloc;
     Instruction::Cache &cache;
     ReMapper &reMap;
-    llvm::MutableArrayRef<Instruction *> operands;
-    constexpr operator llvm::MutableArrayRef<Instruction *>() const {
-      return operands;
-    }
+    MutPtrVector<Instruction *> operands;
+    constexpr operator MutPtrVector<Instruction *>() const { return operands; }
     void merge(size_t i, Instruction *A, Instruction *B) {
       operands[i] = reMap[A]->replaceAllUsesOf(reMap[B]);
     }
@@ -178,7 +176,7 @@ struct MergingCost {
   static auto init(Allocate a, Instruction *A) -> SelectAllocator {
     size_t numOps = A->getNumOperands();
     auto **operandsPtr = a.alloc.allocate<Instruction *>(numOps);
-    llvm::MutableArrayRef<Instruction *> operands(operandsPtr, numOps);
+    MutPtrVector<Instruction *> operands{operandsPtr, numOps};
     return SelectAllocator{a.alloc, a.cache, a.reMap, operands};
   }
   static auto init(Count, Instruction *) -> SelectCounter {
@@ -212,8 +210,8 @@ struct MergingCost {
     // so we need to check if any operand pairs are merged with each other.
     // note `isMerged(a,a) == true`, so that's the one query we need to use.
     auto selector = init(selects, A);
-    llvm::MutableArrayRef<Instruction *> operandsA = A->getOperands();
-    llvm::MutableArrayRef<Instruction *> operandsB = B->getOperands();
+    MutPtrVector<Instruction *> operandsA = A->getOperands();
+    MutPtrVector<Instruction *> operandsB = B->getOperands();
     size_t numOperands = operandsA.size();
     assert(numOperands == operandsB.size());
     uint8_t associativeOpsFlag = B->associativeOperandsFlag();
@@ -316,20 +314,20 @@ struct MergingCost {
   }
 };
 
-void mergeInstructions(
+inline void mergeInstructions(
   BumpAlloc<> &alloc, Instruction::Cache &cache, Predicate::Map &predMap,
   llvm::TargetTransformInfo &TTI, unsigned int vectorBits,
   amap<std::pair<Instruction::Intrinsic, llvm::Type *>,
-       llvm::SmallVector<std::pair<Instruction *, Predicate::Set>>> &opMap,
-  llvm::SmallVectorImpl<MergingCost *> &mergingCosts, Instruction *I,
+       BumpPtrVector<std::pair<Instruction *, Predicate::Set>>> &opMap,
+  llvm::SmallVectorImpl<MergingCost *> &mergingCosts, Instruction *J,
   llvm::BasicBlock *BB, Predicate::Set &preds) {
   // have we already visited?
-  if (mergingCosts.front()->visited(I)) return;
+  if (mergingCosts.front()->visited(J)) return;
   for (auto C : mergingCosts) {
-    if (C->visited(I)) return;
-    C->initAncestors(alloc, I);
+    if (C->visited(J)) return;
+    C->initAncestors(alloc, J);
   }
-  auto op = I->getOpType();
+  auto op = J->getOpType();
   // TODO: confirm that `vec` doesn't get moved if `opMap` is resized
   auto &vec = opMap[op];
   // consider merging with every instruction sharing an opcode
@@ -351,19 +349,19 @@ void mergeInstructions(
     // invalidation, we use an indexed loop
     for (size_t i = 0; i < numMerges; ++i) {
       MergingCost *C = mergingCosts[i];
-      if (C->getAncestors(I)->contains(other)) continue;
+      if (C->getAncestors(J)->contains(other)) continue;
       // we shouldn't have to check the opposite condition
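-      // if (C->getAncestors(other)->contains(I))
+      // if (C->getAncestors(other)->contains(J))
       // because we are traversing in topological order
-      // that is, we haven't visited any descendants of `I`
+      // that is, we haven't visited any descendants of `J`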
       // so only an ancestor had a chance
       auto *MC = alloc.construct<MergingCost>(*C);
       // MC is a copy of C, except we're now merging
-      MC->merge(alloc, TTI, vectorBits, other, I);
+      MC->merge(alloc, TTI, vectorBits, other, J);
     }
   }
   // descendants aren't legal merge candidates, so check before merging
-  for (Instruction *U : I->getUsers()) {
+  for (Instruction *U : J->getUsers()) {
     if (llvm::BasicBlock *BBU = U->getBasicBlock()) {
       if (BBU == BB) {
         // fast path, skip lookup
@@ -376,7 +374,7 @@ void mergeInstructions(
     }
   }
   // descendants aren't legal merge candidates, so push after merging
-  vec.push_back({I, preds});
+  vec.push_back({J, preds});
   // TODO: prune bad candidates from mergingCosts
 }
 
@@ -391,21 +389,22 @@ void mergeInstructions(
 /// merging as it allocates a lot of memory that it can free when it is done.
 /// TODO: this algorithm is exponential in time and memory.
 /// Odds are that there's way smarter things we can do.
-void mergeInstructions(BumpAlloc<> &alloc, Instruction::Cache &cache,
-                       Predicate::Map &predMap, llvm::TargetTransformInfo &TTI,
-                       BumpAlloc<> &tAlloc, unsigned int vectorBits) {
+inline void mergeInstructions(BumpAlloc<> &alloc, Instruction::Cache &cache,
+                              Predicate::Map &predMap,
+                              llvm::TargetTransformInfo &TTI,
+                              BumpAlloc<> &tAlloc, unsigned int vectorBits) {
   if (!predMap.isDivergent()) return;
   // there is a divergence in the control flow that we can ideally merge
   amap<std::pair<Instruction::Intrinsic, llvm::Type *>,
-       llvm::SmallVector<std::pair<Instruction *, Predicate::Set>>>
-    opMap{};
+       BumpPtrVector<std::pair<Instruction *, Predicate::Set>>>
+    opMap{tAlloc};
   llvm::SmallVector<MergingCost *> mergingCosts;
-  mergingCosts.push_back(alloc.construct<MergingCost>());
+  mergingCosts.emplace_back(alloc);
   for (auto &pred : predMap) {
     for (llvm::Instruction &lI : *pred.first) {
-      if (Instruction *I = cache[&lI]) {
+      if (Instruction *J = cache[&lI]) {
         mergeInstructions(tAlloc, cache, predMap, TTI, vectorBits, opMap,
-                          mergingCosts, I, pred.first, pred.second);
+                          mergingCosts, J, pred.first, pred.second);
       }
     }
   }
@@ -421,19 +420,19 @@ void mergeInstructions(BumpAlloc<> &alloc, Instruction::Cache &cache,
     auto [A, B] = pair;
     A = reMap[A];
     B = reMap[B];
-    llvm::MutableArrayRef<Instruction *> operands =
-      minCostStrategy->mergeOperands(
-        A, B, MergingCost::Allocate{alloc, cache, reMap});
+    auto operands = minCostStrategy->mergeOperands(
+      A, B, MergingCost::Allocate{alloc, cache, reMap});
     A->replaceAllUsesOf(B)->setOperands(operands);
     reMap.remapFromTo(B, A);
   }
   // free memory
   tAlloc.reset();
 }
 
-void mergeInstructions(BumpAlloc<> &alloc, Instruction::Cache &cache,
-                       LoopTree *loopForest, llvm::TargetTransformInfo &TTI,
-                       BumpAlloc<> &tAlloc, unsigned int vectorBits) {
+inline void mergeInstructions(BumpAlloc<> &alloc, Instruction::Cache &cache,
+                              LoopTree *loopForest,
+                              llvm::TargetTransformInfo &TTI,
+                              BumpAlloc<> &tAlloc, unsigned int vectorBits) {
   for (auto &predMap : loopForest->getPaths())
     mergeInstructions(alloc, cache, predMap, TTI, tAlloc, vectorBits);
   for (auto subLoop : loopForest->getSubLoops())
diff --git a/include/Instruction.hpp b/include/Instruction.hpp
@@ -173,7 +173,7 @@ struct Instruction {
   [[no_unique_address]] LinAlg::BumpPtrVector<RecipThroughputLatency> costs;
 
   void setOperands(MutPtrVector<Instruction *> ops) {
-    operands = ops;
+    operands << ops;
     for (auto op : ops) op->users.insert(this);
   }
 
@@ -1311,6 +1311,7 @@ struct Map {
   }
 
 }; // struct Map
+
 } // namespace Predicate
 
 inline auto Instruction::Cache::getInstruction(BumpAlloc<> &alloc,
diff --git a/include/LoopBlock.hpp b/include/LoopBlock.hpp
@@ -71,8 +71,7 @@ struct ScheduledNode {
     addMemory(sId, store, nodeIndex);
   }
   [[nodiscard]] constexpr auto
-  getMemAccesses(BumpAlloc<> &alloc,
-                 llvm::ArrayRef<MemoryAccess *> memAccess) const
+  getMemAccesses(BumpAlloc<> &alloc, PtrVector<MemoryAccess *> memAccess) const
     -> Vector<Address *> {
     // First, we invert the schedule matrix.
     SquarePtrMatrix<int64_t> Phi = schedule.getPhi();
@@ -110,6 +109,7 @@ struct ScheduledNode {
   constexpr void addInNeighbor(unsigned int i) { inNeighbors.insert(i); }
   constexpr void init(BumpAlloc<> &alloc) {
     schedule = AffineSchedule(alloc, getNumLoops());
+    schedule.getFusionOmega() << 0;
   }
   constexpr void addMemory(unsigned memId, MemoryAccess *mem,
                            unsigned nodeIndex) {
@@ -1042,7 +1042,7 @@ class LinearProgramLoopBlock {
     for (auto &&node : nodes) {
       if (depth >= node.getNumLoops()) continue;
       if (!hasActiveEdges(g, node)) {
-        node.getOffsetOmega()[depth] = std::numeric_limits<int64_t>::min();
+        node.getOffsetOmega(depth) = std::numeric_limits<int64_t>::min();
         if (!node.phiIsScheduled(depth))
           node.getSchedule(depth) << std::numeric_limits<int64_t>::min();
         continue;
@@ -1219,7 +1219,7 @@ class LinearProgramLoopBlock {
       Graph &gi = graphs[i];
       if (!canFuse(*gp, gi, d)) {
         // do not fuse
-        for (auto &&v : *gp) v.getFusionOmega()[d] = unfusedOffset;
+        for (auto &&v : *gp) v.getFusionOmega(d) = unfusedOffset;
         ++unfusedOffset;
         // gi is the new base graph
         gp = &gi;
@@ -1228,7 +1228,7 @@ class LinearProgramLoopBlock {
         (*gp) |= gi;
     }
     // set omegas for gp
-    for (auto &&v : *gp) v.getFusionOmega()[d] = unfusedOffset;
+    for (auto &&v : *gp) v.getFusionOmega(d) = unfusedOffset;
     ++d;
     // size_t numSat = satDeps.size();
     for (auto i : baseGraphs)
diff --git a/include/LoopForest.hpp b/include/LoopForest.hpp
@@ -8,7 +8,6 @@
 #include <cstddef>
 #include <iterator>
 #include <limits>
-
 #include <llvm/ADT/SmallVector.h>
 #include <llvm/Analysis/LoopInfo.h>
 #include <llvm/Analysis/ScalarEvolution.h>
@@ -36,6 +35,10 @@ struct LoopTree {
   [[no_unique_address]] Optional<LoopTree *> parentLoop{nullptr};
   [[no_unique_address]] llvm::SmallVector<NotNull<MemoryAccess>> memAccesses{};
 
+  ~LoopTree() {
+    for (auto subLoop : subLoops) subLoop->~LoopTree();
+  }
+
   auto getPaths() -> llvm::MutableArrayRef<Predicate::Map> { return paths; }
   auto getPaths() const -> llvm::ArrayRef<Predicate::Map> { return paths; }
   auto getSubLoops() -> llvm::MutableArrayRef<NotNull<LoopTree>> {
@@ -56,8 +59,8 @@ struct LoopTree {
   }
   // LoopTree(const LoopTree &) = default;
   // LoopTree(LoopTree &&) = default;
-  auto operator=(const LoopTree &) -> LoopTree & = default;
-  auto operator=(LoopTree &&) -> LoopTree & = default;
+  auto operator=(const LoopTree &) -> LoopTree & = delete;
+  auto operator=(LoopTree &&) -> LoopTree & = delete;
   LoopTree(llvm::SmallVector<NotNull<LoopTree>> sL,
            llvm::SmallVector<Predicate::Map> pth)
     : loop(nullptr), subLoops(std::move(sL)), paths(std::move(pth)) {}
@@ -109,9 +112,9 @@ struct LoopTree {
   [[nodiscard]] auto size() const -> size_t { return subLoops.size(); }
 
   static void split(BumpAlloc<> &alloc,
-                    llvm::SmallVectorImpl<NotNull<LoopTree>> &trees,
-                    llvm::SmallVectorImpl<Predicate::Map> &paths,
-                    llvm::SmallVectorImpl<NotNull<LoopTree>> &subTree) {
+                    llvm::SmallVector<NotNull<LoopTree>> &trees,
+                    llvm::SmallVector<Predicate::Map> &paths,
+                    llvm::SmallVector<NotNull<LoopTree>> &subTree) {
     if (subTree.size()) {
       assert(1 + subTree.size() == paths.size());
       auto *newTree =
diff --git a/include/Loops.hpp b/include/Loops.hpp
@@ -509,7 +509,7 @@ struct AffineLoopNest
           for (size_t i = innermostLoopInd; i < numToRemove + innermostLoopInd;
                ++i)
             A(m, i) = A(m, i + numRemainingLoops);
-          A(m, _(numToRemove + innermostLoopInd, N)) = tmp;
+          A(m, _(numToRemove + innermostLoopInd, N)) << tmp;
         }
       }
     } else
diff --git a/include/Math/Array.hpp b/include/Math/Array.hpp
@@ -283,7 +283,8 @@ struct MutArray : Array<T, S>, ArrayOps<T, S, MutArray<T, S>> {
 
   constexpr MutArray(const MutArray &) = default;
   constexpr MutArray(MutArray &&) noexcept = default;
-  constexpr auto operator=(const MutArray &) -> MutArray & = default;
+  constexpr auto operator=(const MutArray &) -> MutArray & = delete;
+  // constexpr auto operator=(const MutArray &) -> MutArray & = default;
   constexpr auto operator=(MutArray &&) noexcept -> MutArray & = default;
 
   constexpr void truncate(S nz) {
@@ -851,7 +852,8 @@ struct ReallocView : ResizeableView<T, S, U> {
 #else
     T *newPtr = allocator.allocate(newCapacity);
 #endif
-    if (U oldLen = U(this->sz)) std::copy_n(this->data(), oldLen, newPtr);
+    if (U oldLen = U(this->sz))
+      std::uninitialized_copy_n(this->data(), oldLen, newPtr);
     maybeDeallocate(newPtr, newCapacity);
   }
   [[nodiscard]] constexpr auto get_allocator() const noexcept -> A {
diff --git a/include/Math/BumpVector.hpp b/include/Math/BumpVector.hpp
@@ -273,3 +273,4 @@ template <typename T, unsigned InitialCapacity = 8> struct BumpPtrVector {
 static_assert(std::is_trivially_destructible_v<MutPtrVector<int64_t>>);
 static_assert(std::is_trivially_destructible_v<BumpPtrVector<int64_t>>);
 } // namespace LinAlg
+using LinAlg::BumpPtrVector;
diff --git a/include/Math/Constraints.hpp b/include/Math/Constraints.hpp
@@ -254,7 +254,7 @@ constexpr void slackEqualityConstraints(MutPtrMatrix<int64_t> C,
 // counts how many negative and positive elements there are in row `i`.
 // A row corresponds to a particular variable in `A'x <= b`.
 constexpr auto countNonZeroSign(DensePtrMatrix<int64_t> A, size_t i)
-  -> std::pair<size_t, size_t> {
+  -> std::array<size_t, 2> {
   size_t numNeg = 0;
   size_t numPos = 0;
   Row numRow = A.numRow();
@@ -263,7 +263,7 @@ constexpr auto countNonZeroSign(DensePtrMatrix<int64_t> A, size_t i)
     numNeg += (Aij < 0);
     numPos += (Aij > 0);
   }
-  return std::make_pair(numNeg, numPos);
+  return {numNeg, numPos};
 }
 
 /// x == 0 -> 0, x < 0 -> 1, x > 0 -> 2
diff --git a/include/TurboLoop.hpp b/include/TurboLoop.hpp
diff --git a/include/Utilities/Allocators.hpp b/include/Utilities/Allocators.hpp

Original file line number	Diff line number	Diff line change
`@@ -173,7 +173,7 @@ struct Instruction {`
`173`	`173`	`[[no_unique_address]] LinAlg::BumpPtrVector<RecipThroughputLatency> costs;`
`174`	`174`
`175`	`175`	`void setOperands(MutPtrVector<Instruction *> ops) {`
`176`		`- operands = ops;`
	`176`	`+ operands << ops;`
`177`	`177`	`for (auto op : ops) op->users.insert(this);`
`178`	`178`	`}`
`179`	`179`
`@@ -1311,6 +1311,7 @@ struct Map {`
`1311`	`1311`	`}`
`1312`	`1312`
`1313`	`1313`	`}; // struct Map`
	`1314`	`+`
`1314`	`1315`	`} // namespace Predicate`
`1315`	`1316`
`1316`	`1317`	`inline auto Instruction::Cache::getInstruction(BumpAlloc<> &alloc,`
Original file line number	Diff line number	Diff line change
`@@ -509,7 +509,7 @@ struct AffineLoopNest`
`509`	`509`	`for (size_t i = innermostLoopInd; i < numToRemove + innermostLoopInd;`
`510`	`510`	`++i)`
`511`	`511`	`A(m, i) = A(m, i + numRemainingLoops);`
`512`		`- A(m, _(numToRemove + innermostLoopInd, N)) = tmp;`
	`512`	`+ A(m, _(numToRemove + innermostLoopInd, N)) << tmp;`
`513`	`513`	`}`
`514`	`514`	`}`
`515`	`515`	`} else`