diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a97f27093..ae8f287a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,9 +10,9 @@ jobs: sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" sudo wget https://apt.llvm.org/llvm.sh sudo chmod +x llvm.sh - sudo ./llvm.sh 16 all + sudo ./llvm.sh 17 all sudo apt install g++-12 libgtest-dev ninja-build pkg-config cmake gcovr - sudo ln -s $(which opt-16) /usr/local/bin/opt + sudo ln -s $(which opt-17) /usr/local/bin/opt - run: cmake -G Ninja -S test -B build/test -DUSE_SANITIZER='Undefined' -DENABLE_TEST_COVERAGE=1 -DCMAKE_BUILD_TYPE=Debug -DENABLE_LLD=OFF env: CXX: g++-12 @@ -22,7 +22,7 @@ jobs: CTEST_OUTPUT_ON_FAILURE: 1 - run: cmake -G Ninja -S test -B builddirclang/test -DCMAKE_BUILD_TYPE=RelWithDebInfo -DUSE_SANITIZER='Undefined' -DCMAKE_PREFIX_PATH=/usr/local -DENABLE_LLD=OFF env: - CXX: clang++-16 + CXX: clang++-17 - run: cmake --build builddirclang/test - run: cmake --build builddirclang/test --target test env: @@ -37,7 +37,7 @@ jobs: # - uses: actions/checkout@v3 # - run: sudo xcode-select --switch /Library/Developer/CommandLineTools # - run: echo $(pkgutil --pkg-info=com.apple.pkg.CLTools_Executables) - # - run: brew install llvm@16 ninja pkg-config cmake gcovr # gcc + # - run: brew install llvm@17 ninja pkg-config cmake gcovr # gcc # - run: echo "/usr/local/opt/llvm/bin" >> $GITHUB_PATH # - run: echo $(which clang++) # - run: cmake -G Ninja -S test -B build/test -DUSE_SANITIZER='Undefined' -DCMAKE_PREFIX_PATH=/usr/local -DCMAKE_BUILD_TYPE=Debug -DENABLE_LLD=OFF diff --git a/.gitignore b/.gitignore index 5461dd8e0..f5a066a46 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ latex/ html/ coverage.* coverage-final.json -Testing \ No newline at end of file +Testing +compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index bf5ec5a04..663dacf44 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,7 @@ 
CPMAddPackage("gh:TheLartians/PackageProject.cmake@1.8.0") # clang;clang-tools-extra;lld;lldb;polly;pstl" "LLVM_ENABLE_RUNTIMES all" "LLVM_ENABLE_RTTI OFF" # "BUILD_SHARED_LIBS OFF" "LLVM_CCACHE_BUILD ON" "LLVM_OPTIMIZED_TABLEGEN ON" "LLVM_ENABLE_LTO ON" # "LLVM_ENABLE_Z3_SOLVER ON" ) -find_package(LLVM 16 REQUIRED CONFIG) +find_package(LLVM 17 REQUIRED CONFIG) list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR}) # include(AddLLVM) include(${LLVM_DIR}/AddLLVM.cmake) @@ -85,7 +85,7 @@ add_library(${PROJECT_NAME} MODULE ${headers} ${sources}) set(CXX_STANDARD_REQUIRED ON) set_target_properties( ${PROJECT_NAME} - PROPERTIES CXX_STANDARD 20 + PROPERTIES CXX_STANDARD 23 CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN ON ) @@ -186,5 +186,5 @@ packageProject( INCLUDE_DESTINATION include/${PROJECT_NAME}-${PROJECT_VERSION} VERSION_HEADER "${VERSION_HEADER_LOCATION}" COMPATIBILITY SameMinorVersion - DEPENDENCIES "LLVM 15.0.6" + DEPENDENCIES "LLVM 17.0.1" ) diff --git a/Doxyfile b/Doxyfile index cbabce903..709946500 100644 --- a/Doxyfile +++ b/Doxyfile @@ -86,7 +86,7 @@ CREATE_SUBDIRS = NO # level increment doubles the number of directories, resulting in 4096 # directories at level 8 which is the default and also the maximum value. The # sub-directories are organized in 2 levels, the first level always has a fixed -# numer of 16 directories. +# number of 16 directories. # Minimum value: 0, maximum value: 8, default value: 8. # This tag requires that the tag CREATE_SUBDIRS is set to YES. 
diff --git a/LICENSE b/LICENSE index 94472c35c..2c88274e8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,222 +1,26 @@ -============================================================================== -The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: -============================================================================== +MIT License - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ +Copyright (c) 2021-2024: Chris Elrod, Yingbo Ma, JuliaHub, and other contributors: https://github.com/LoopModels/LoopModels/graphs/contributors - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: - 1. Definitions. +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. 
+end of terms and conditions - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ----- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. 
- -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. +Please see [THIRDPARTY.md](./THIRDPARTY.md) for license information for other software used in this project. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 5b707b3bc..13ffd9b66 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.23) project(LoopModelsBenchmarks LANGUAGES C CXX) option(ENABLE_NATIVE_COMPILATION "Compile with -march=native" ON) +option(ENABLE_OPENMP "Use OpenMP for a multithreading benchmark" OFF) # --- Import tools ---- @@ -33,10 +34,17 @@ CPMAddPackage( GIT_TAG fnoexceptions SYSTEM TRUE ) +FetchContent_Declare( + Math + GIT_REPOSITORY https://github.com/LoopModels/Math.git + GIT_TAG origin/main +) +FetchContent_MakeAvailable(Math) + # file(GLOB_RECURSE headers CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/include/*.hpp) file(GLOB benchmarks CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) -find_package(LLVM 16 REQUIRED CONFIG) +find_package(LLVM 17 REQUIRED CONFIG) list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR}) include(${LLVM_DIR}/AddLLVM.cmake) # message(STATUS "headers: ${headers}") add_executable(${PROJECT_NAME} ${headers} ${benchmarks}) @@ -50,10 +58,12 @@ target_include_directories(${PROJECT_NAME} SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) target_include_directories( ${PROJECT_NAME} PRIVATE ${PROJECT_SOURCE_DIR}/../include ${PROJECT_SOURCE_DIR}/include ) -find_package(OpenMP) +if(ENABLE_OPENMP) + find_package(OpenMP) + 
target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX) +endif() target_link_libraries( - ${PROJECT_NAME} PRIVATE benchmark::benchmark LLVM unordered_dense::unordered_dense - OpenMP::OpenMP_CXX + ${PROJECT_NAME} PRIVATE benchmark::benchmark LLVM unordered_dense::unordered_dense Math ) if((CMAKE_CXX_COMPILER_ID MATCHES "Clang") OR (CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")) @@ -78,7 +88,7 @@ if(ENABLE_NATIVE_COMPILATION) endif() set_target_properties( ${PROJECT_NAME} - PROPERTIES CXX_STANDARD 20 + PROPERTIES CXX_STANDARD 23 CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN ON ) @@ -103,10 +113,12 @@ target_compile_options( -Wextra -save-temps ) -if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") - target_compile_options(${PROJECT_NAME} PRIVATE -fiopenmp) -else() - target_compile_options(${PROJECT_NAME} PRIVATE -fopenmp) +if(ENABLE_OPENMP) + if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") + target_compile_options(${PROJECT_NAME} PRIVATE -fiopenmp) + else() + target_compile_options(${PROJECT_NAME} PRIVATE -fopenmp) + endif() endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") target_compile_options(${PROJECT_NAME} PRIVATE -masm=intel) diff --git a/benchmark/include/constraint_pruning_benchmark.hpp b/benchmark/include/constraint_pruning_benchmark.hpp index 06210a395..e2a2a0493 100644 --- a/benchmark/include/constraint_pruning_benchmark.hpp +++ b/benchmark/include/constraint_pruning_benchmark.hpp @@ -1,63 +1,66 @@ #pragma once -#include "Math/NormalForm.hpp" -#include "Math/Orthogonalize.hpp" -#include "MatrixStringParse.hpp" +#include +#include +#include #include #include #include #include +using poly::math::Vector, poly::math::IntMatrix, poly::math::Row, + poly::math::Col, poly::math::DenseDims, poly::math::_, + poly::utils::operator""_mat; static void BM_NullSpace(benchmark::State &state) { - IntMatrix B(DenseDims{Row{6}, Col{3}}); - B(0, 0) = 1; - B(1, 0) = 0; - B(2, 0) = -3; - B(3, 0) = 0; - B(4, 0) = 2; - B(5, 0) = -8; + IntMatrix<> 
B(poly::math::DenseDims{Row<>{6}, Col<>{3}}); + B[0, 0] = 1; + B[1, 0] = 0; + B[2, 0] = -3; + B[3, 0] = 0; + B[4, 0] = 2; + B[5, 0] = -8; - B(0, 1) = 0; - B(1, 1) = 1; - B(2, 1) = 5; - B(3, 1) = 0; - B(4, 1) = -1; - B(5, 1) = 4; + B[0, 1] = 0; + B[1, 1] = 1; + B[2, 1] = 5; + B[3, 1] = 0; + B[4, 1] = -1; + B[5, 1] = 4; - B(0, 2) = 0; - B(1, 2) = 0; - B(2, 2) = 0; - B(3, 2) = 1; - B(4, 2) = 7; - B(5, 2) = -9; + B[0, 2] = 0; + B[1, 2] = 0; + B[2, 2] = 0; + B[3, 2] = 1; + B[4, 2] = 7; + B[5, 2] = -9; // fourth row is 0 // std::cout << "B=\n" << B << "\nnullSpace(B) =\n" << // NormalForm::nullSpace(B) << std::endl; - IntMatrix A; - for (auto b : state) A = NormalForm::nullSpace(B); + poly::math::IntMatrix<> A; + for (auto b : state) A = poly::math::NormalForm::nullSpace(B); } // Register the function as a benchmark BENCHMARK(BM_NullSpace); static void BM_NullSpace2000(benchmark::State &state) { const size_t N = 20; - IntMatrix A(DenseDims{Row{N}, Col{N}}); + IntMatrix<> A(DenseDims{Row<>{N}, Col<>{N}}); A << 0; - A(0, 0) = 2; + A[0, 0] = 2; for (size_t i = 1; i < N; ++i) { - A(i - 1, i) = -1; - A(i, i) = 2; - A(i, i - 1) = -1; + A[i - 1, i] = -1; + A[i, i] = 2; + A[i, i - 1] = -1; } for (size_t j = 0; j < N; j += 8) { - A(j, _) << 0; - for (size_t i = 0; i < N; i += 7) A(j, _) += ((i & 1) ? 1 : -1) * A(i, _); + A[j, _] << 0; + for (size_t i = 0; i < N; i += 7) A[j, _] += ((i & 1) ? 
1 : -1) * A[i, _]; } // fourth row is 0 - IntMatrix NS; - for (auto b : state) NS = NormalForm::nullSpace(A); + IntMatrix<> NS; + for (auto b : state) NS = poly::math::NormalForm::nullSpace(A); // std::cout << "NS.size() = (" << NS.numRow() << ", " << NS.numCol() << ")" // << std::endl; } @@ -65,39 +68,38 @@ static void BM_NullSpace2000(benchmark::State &state) { BENCHMARK(BM_NullSpace2000); static void BM_Orthogonalize(benchmark::State &state) { - IntMatrix A = + IntMatrix<> A = "[-2 2 0 1 1 1 2; 3 -3 2 3 2 3 2; -3 0 2 3 -2 0 1; 2 1 0 -1 3 -1 1; 1 -3 -3 -2 2 -2 2; 0 0 1 2 -3 -2 -2; 0 -3 -2 -1 1 0 1]"_mat; - IntMatrix B; + IntMatrix<> B; for (auto b : state) B = orthogonalize(A); } BENCHMARK(BM_Orthogonalize); static void BM_Bareiss2000(benchmark::State &state) { const size_t N = 20; - IntMatrix A(DenseDims{Row{N}, Col{N}}); + IntMatrix<> A(DenseDims{Row<>{N}, Col<>{N}}); A << 0; - A(0, 0) = 2; + A[0, 0] = 2; for (size_t i = 1; i < N; ++i) { - A(i - 1, i) = -1; - A(i, i) = 2; - A(i, i - 1) = -1; + A[i - 1, i] = -1; + A[i, i] = 2; + A[i, i - 1] = -1; } for (size_t j = 0; j < N; j += 8) { - // A(j,:) - for (size_t i = 0; i < N; ++i) A(j, i) = 0; + A[j, _] << 0; for (size_t i = 0; i < N; i += 7) { int64_t s = (i & 1) ? 
1 : -1; - for (size_t k = 0; k < N; ++k) A(j, k) += s * A(i, k); + A[j, _] += s * A[i, _]; } } // std::cout << A << std::endl; // fourth row is 0 - Vector pivots(N); - IntMatrix B; + Vector pivots(N); + IntMatrix<> B; for (auto b : state) { B = A; - NormalForm::bareiss(B, pivots); + poly::math::NormalForm::bareiss(B, pivots); } // std::cout << "NS.size() = (" << NS.numRow() << ", " << NS.numCol() << ")" // << std::endl; diff --git a/benchmark/include/map_benchmark.hpp b/benchmark/include/map_benchmark.hpp index b1cca726f..b385e4edb 100644 --- a/benchmark/include/map_benchmark.hpp +++ b/benchmark/include/map_benchmark.hpp @@ -1,181 +1,222 @@ #pragma once -#include "Containers/BumpMapSet.hpp" +#include "Alloc/Arena.hpp" +#include "Dicts/BumpMapSet.hpp" #include "Dicts/BumpVector.hpp" -#include "Utilities/Allocators.hpp" +#include "Dicts/Trie.hpp" #include #include -#include #include -#include #include #include #include +template struct TrieWrap { + D d; + poly::alloc::Arena<> *alloc; + + template auto operator[](const K &k) -> auto & { + return d[alloc, k]; + }; + template void erase(const K &k) { d.erase(k); } +}; + +inline auto randvp(std::mt19937_64 &rng, uint64_t mask) { + return reinterpret_cast((rng() & mask) | 8); +} + template -void InsertLookup2(std::mt19937_64 &mt, D &map, uint64_t mask) { +void InsertLookup2(std::mt19937_64 &rng, D &map, uint64_t mask) { for (uint64_t i = 0; i < 256; ++i) { - map[reinterpret_cast(mt() & mask)] += - i + map[reinterpret_cast(mt() & mask)]; + void *p0 = randvp(rng, mask); + void *p1 = randvp(rng, mask); + map[p0] += i + map[p1]; } } template -void InsertErase(std::mt19937_64 &mt, D &map, uint64_t mask) { +void InsertErase(std::mt19937_64 &rng, D &map, uint64_t mask) { for (uint64_t i = 0; i < 256; ++i) { - map[reinterpret_cast(mt() & mask)] = i; - map.erase(reinterpret_cast(mt() & mask)); + void *p0 = randvp(rng, mask); + void *p1 = randvp(rng, mask); + map[p0] = i; + map.erase(p1); } } template -void 
InsertLookup3(std::mt19937_64 &mt, D &map, uint64_t mask) { +void InsertLookup3(std::mt19937_64 &rng, D &map, uint64_t mask) { for (uint64_t i = 0; i < 256; ++i) { - map[reinterpret_cast(mt() & mask)] += - map[reinterpret_cast(mt() & mask)] + - map[reinterpret_cast(mt() & mask)]; + void *p0 = randvp(rng, mask); + void *p1 = randvp(rng, mask); + void *p2 = randvp(rng, mask); + map[p0] += map[p1] + map[p2]; } } static void BM_llvmDenseMapInsertErase(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng{}; for (auto b : state) { llvm::DenseMap map{}; - InsertErase(mt, map, mask); + InsertErase(rng, map, mask); } } BENCHMARK(BM_llvmDenseMapInsertErase)->DenseRange(2, 8, 1); static void BM_llvmSmallDenseMapInsertErase(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { llvm::SmallDenseMap map{}; - InsertErase(mt, map, mask); + InsertErase(rng, map, mask); } } BENCHMARK(BM_llvmSmallDenseMapInsertErase)->DenseRange(2, 8, 1); static void BM_BumpMapInsertErase(benchmark::State &state) { - OwningArena<> alloc; - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + poly::alloc::OwningArena<> alloc; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { - amap map{alloc}; - InsertErase(mt, map, mask); + poly::dict::amap map{&alloc}; + InsertErase(rng, map, mask); alloc.reset(); } } BENCHMARK(BM_BumpMapInsertErase)->DenseRange(2, 8, 1); +static void BM_TrieInsertErase(benchmark::State &state) { + poly::alloc::OwningArena<> alloc; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; + for (auto b : state) { + TrieWrap> map{{}, &alloc}; + InsertErase(rng, map, mask); + alloc.reset(); + } +} 
+BENCHMARK(BM_TrieInsertErase)->DenseRange(2, 8, 1); + +static void BM_InlineTrieInsertErase(benchmark::State &state) { + poly::alloc::OwningArena<> alloc; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; + for (auto b : state) { + TrieWrap> map{{}, &alloc}; + InsertErase(rng, map, mask); + alloc.reset(); + } +} +BENCHMARK(BM_InlineTrieInsertErase)->DenseRange(2, 8, 1); + static void BM_ankerlMapInsertErase(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { ankerl::unordered_dense::map map; - InsertErase(mt, map, mask); + InsertErase(rng, map, mask); } } BENCHMARK(BM_ankerlMapInsertErase)->DenseRange(2, 8, 1); static void BM_stdUnorderedMapInsertErase(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { std::unordered_map map; - InsertErase(mt, map, mask); + InsertErase(rng, map, mask); } } BENCHMARK(BM_stdUnorderedMapInsertErase)->DenseRange(2, 8, 1); static void BM_llvmDenseMapInsertLookup(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { llvm::DenseMap map{}; - InsertLookup2(mt, map, mask); + InsertLookup2(rng, map, mask); } } BENCHMARK(BM_llvmDenseMapInsertLookup)->DenseRange(2, 8, 1); static void BM_llvmSmallDenseMapInsertLookup(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { llvm::SmallDenseMap map{}; - InsertLookup2(mt, map, mask); + InsertLookup2(rng, map, mask); } } 
BENCHMARK(BM_llvmSmallDenseMapInsertLookup)->DenseRange(2, 8, 1); static void BM_BumpMapInsertLookup(benchmark::State &state) { - OwningArena<> alloc; - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + poly::alloc::OwningArena<> alloc; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { - amap map{alloc}; - InsertLookup2(mt, map, mask); + poly::dict::amap map{&alloc}; + InsertLookup2(rng, map, mask); alloc.reset(); } } BENCHMARK(BM_BumpMapInsertLookup)->DenseRange(2, 8, 1); static void BM_ankerlMapInsertLookup(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { ankerl::unordered_dense::map map; - InsertLookup2(mt, map, mask); + InsertLookup2(rng, map, mask); } } BENCHMARK(BM_ankerlMapInsertLookup)->DenseRange(2, 8, 1); static void BM_stdUnorderedMapInsertLookup(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { std::unordered_map map; - InsertLookup2(mt, map, mask); + InsertLookup2(rng, map, mask); } } BENCHMARK(BM_stdUnorderedMapInsertLookup)->DenseRange(2, 8, 1); static void BM_llvmDenseMapInsertLookup3(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { llvm::DenseMap map{}; - InsertLookup3(mt, map, mask); + InsertLookup3(rng, map, mask); } } BENCHMARK(BM_llvmDenseMapInsertLookup3)->DenseRange(2, 8, 1); static void BM_llvmSmallDenseMapInsertLookup3(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + 
std::mt19937_64 rng; for (auto b : state) { llvm::SmallDenseMap map{}; - InsertLookup3(mt, map, mask); + InsertLookup3(rng, map, mask); } } BENCHMARK(BM_llvmSmallDenseMapInsertLookup3)->DenseRange(2, 8, 1); static void BM_BumpMapInsertLookup3(benchmark::State &state) { - OwningArena<> alloc; - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + poly::alloc::OwningArena<> alloc; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { - amap map{alloc}; - InsertLookup3(mt, map, mask); + poly::dict::amap map{&alloc}; + InsertLookup3(rng, map, mask); alloc.reset(); } } BENCHMARK(BM_BumpMapInsertLookup3)->DenseRange(2, 8, 1); static void BM_ankerlMapInsertLookup3(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { ankerl::unordered_dense::map map; - InsertLookup3(mt, map, mask); + InsertLookup3(rng, map, mask); } } BENCHMARK(BM_ankerlMapInsertLookup3)->DenseRange(2, 8, 1); static void BM_stdUnorderedMapInsertLookup3(benchmark::State &state) { - uint64_t mask = ((1ull << state.range(0)) - 1) << 3ull; - std::mt19937_64 mt; + uint64_t mask = ((1ULL << state.range(0)) - 1) << 4ULL; + std::mt19937_64 rng; for (auto b : state) { std::unordered_map map; - InsertLookup3(mt, map, mask); + InsertLookup3(rng, map, mask); } } BENCHMARK(BM_stdUnorderedMapInsertLookup3)->DenseRange(2, 8, 1); @@ -205,9 +246,9 @@ static void BM_llvmSmallDenseMapSeq(benchmark::State &state) { BENCHMARK(BM_llvmSmallDenseMapSeq)->RangeMultiplier(2)->Range(1 << 2, 1 << 10); static void BM_BumpMapSeq(benchmark::State &state) { - OwningArena<> alloc; + poly::alloc::OwningArena<> alloc; for (auto b : state) { - amap map{alloc}; + poly::dict::amap map{&alloc}; for (uint64_t i = 1; i <= uint64_t(state.range(0)); ++i) map[reinterpret_cast(8 * i)] = i; for (uint64_t i = 1; i <= 
uint64_t(state.range(0)); ++i) diff --git a/benchmark/include/matrix_exp.hpp b/benchmark/include/matrix_exp.hpp index 213499ad8..84ada1e17 100644 --- a/benchmark/include/matrix_exp.hpp +++ b/benchmark/include/matrix_exp.hpp @@ -1,11 +1,12 @@ #pragma once -#include "Containers/TinyVector.hpp" -#include "Math/Array.hpp" -#include "Math/LinearAlgebra.hpp" -#include "Math/Matrix.hpp" -#include "Math/StaticArrays.hpp" -#include "Utilities/Invariant.hpp" +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -13,154 +14,28 @@ #include #include -template class Dual { - T val{}; - SVector partials{}; +using poly::math::Dual, poly::math::Vector, poly::containers::TinyVector, + poly::math::SquareMatrix, poly::math::AbstractMatrix, poly::math::SquareDims, + poly::math::I, poly::utils::eltype_t, poly::utils::invariant; -public: - using val_type = T; - static constexpr size_t num_partials = N; - constexpr Dual() = default; - constexpr Dual(T v) : val(v) {} - constexpr Dual(T v, size_t n) : val(v) { partials[n] = T{1}; } - constexpr Dual(T v, SVector g) : val(v), partials(g) {} - constexpr Dual(std::integral auto v) : val(v) {} - constexpr Dual(std::floating_point auto v) : val(v) {} - constexpr auto value() -> T & { return val; } - constexpr auto gradient() -> SVector & { return partials; } - [[nodiscard]] constexpr auto value() const -> const T & { return val; } - [[nodiscard]] constexpr auto gradient() const -> const SVector & { - return partials; - } - // constexpr auto operator[](size_t i) const -> T { return grad[i]; } - // constexpr auto operator[](size_t i) -> T & { return grad[i]; } - constexpr auto operator-() const -> Dual { return Dual(-val, -partials); } - constexpr auto operator+(const Dual &other) const -> Dual { - return {val + other.val, partials + other.partials}; - } - constexpr auto operator-(const Dual &other) const -> Dual { - return {val - other.val, partials - other.partials}; - } - constexpr auto 
operator*(const Dual &other) const -> Dual { - return {val * other.val, val * other.partials + other.val * partials}; - } - constexpr auto operator/(const Dual &other) const -> Dual { - return {val / other.val, (other.val * partials - val * other.partials) / - (other.val * other.val)}; - } - constexpr auto operator+=(const Dual &other) -> Dual & { - val += other.val; - partials += other.partials; - return *this; - } - constexpr auto operator-=(const Dual &other) -> Dual & { - val -= other.val; - partials -= other.partials; - return *this; - } - constexpr auto operator*=(const Dual &other) -> Dual & { - val *= other.val; - partials = val * other.partials + other.val * partials; - return *this; - } - constexpr auto operator/=(const Dual &other) -> Dual & { - val /= other.val; - partials = - (other.val * partials - val * other.partials) / (other.val * other.val); - return *this; - } - constexpr auto operator+(double other) const -> Dual { - return {val + other, partials}; - } - constexpr auto operator-(double other) const -> Dual { - return {val - other, partials}; - } - constexpr auto operator*(double other) const -> Dual { - return {val * other, other * partials}; - } - constexpr auto operator/(double other) const -> Dual { - return {val / other, partials / other}; - } - constexpr auto operator+=(double other) -> Dual & { - val += other; - return *this; - } - constexpr auto operator-=(double other) -> Dual & { - val -= other; - return *this; - } - constexpr auto operator*=(double other) -> Dual & { - val *= other; - partials *= other; - return *this; - } - constexpr auto operator/=(double other) -> Dual & { - val /= other; - partials /= other; - return *this; - } - constexpr auto operator==(const Dual &other) const -> bool { - return val == other.val; // && grad == other.grad; - } - constexpr auto operator!=(const Dual &other) const -> bool { - return val != other.val; // || grad != other.grad; - } - constexpr auto operator==(double other) const -> bool { return val 
== other; } - constexpr auto operator!=(double other) const -> bool { return val != other; } - constexpr auto operator<(double other) const -> bool { return val < other; } - constexpr auto operator>(double other) const -> bool { return val > other; } - constexpr auto operator<=(double other) const -> bool { return val <= other; } - constexpr auto operator>=(double other) const -> bool { return val >= other; } - constexpr auto operator<(const Dual &other) const -> bool { - return val < other.val; - } - constexpr auto operator>(const Dual &other) const -> bool { - return val > other.val; - } - constexpr auto operator<=(const Dual &other) const -> bool { - return val <= other.val; - } - constexpr auto operator>=(const Dual &other) const -> bool { - return val >= other.val; - } -}; -template Dual(T, SVector) -> Dual; - -template -constexpr auto operator+(double other, Dual x) -> Dual { - return {x.value() + other, x.gradient()}; -} -template -constexpr auto operator-(double other, Dual x) -> Dual { - return {x.value() - other, -x.gradient()}; -} -template -constexpr auto operator*(double other, Dual x) -> Dual { - return {x.value() * other, other * x.gradient()}; -} -template -constexpr auto operator/(double other, Dual x) -> Dual { - return {other / x.value(), -other * x.gradient() / (x.value() * x.value())}; -} -static_assert(ElementOf, 2>>>); // auto x = Dual, 2>{1.0}; // auto y = x * 3.4; static_assert(std::convertible_to>); static_assert(std::convertible_to, 2>>); -template struct URand { - using T = typename D::val_type; - static constexpr size_t N = D::num_partials; - auto operator()(std::mt19937_64 &mt) -> D { - Dual x{URand{}(mt)}; - for (size_t i = 0; i < N; ++i) x.gradient()[i] = URand{}(mt); +template struct URand {}; + +template struct URand> { + auto operator()(std::mt19937_64 &rng) -> Dual { + Dual x{URand{}(rng)}; + for (size_t i = 0; i < N; ++i) x.gradient()[i] = URand{}(rng); return x; } }; template <> struct URand { - auto operator()(std::mt19937_64 
&mt) -> double { - return std::uniform_real_distribution(-2, 2)(mt); + auto operator()(std::mt19937_64 &rng) -> double { + return std::uniform_real_distribution(-2, 2)(rng); } }; @@ -207,10 +82,10 @@ template constexpr auto opnorm1(const T &A) { v.resizeForOverwrite(n); invariant(A.numRow() > 0); for (size_t j = 0; j < n; ++j) - v[j] = std::abs(extractDualValRecurse(A(0, j))); + v[j] = std::abs(extractDualValRecurse(A[0, j])); for (size_t i = 1; i < n; ++i) for (size_t j = 0; j < n; ++j) - v[j] += std::abs(extractDualValRecurse(A(i, j))); + v[j] += std::abs(extractDualValRecurse(A[i, j])); return *std::max_element(v.begin(), v.end()); } @@ -262,7 +137,7 @@ template constexpr auto expm(const T &A) { *v += *u; } // return (V - U) \ (V + U); - LU::fact(std::move(A2)).ldiv(MutPtrMatrix(V)); + poly::math::LU::fact(std::move(A2)).ldiv(poly::math::MutPtrMatrix(V)); for (; s--;) { U = V * V; std::swap(U, V); @@ -286,28 +161,28 @@ void expbench(const auto &A) { static void BM_expm(benchmark::State &state) { unsigned dim = state.range(0); - std::mt19937_64 mt(0); + std::mt19937_64 rng0; SquareMatrix A{SquareDims{dim}}; - for (auto &a : A) a = URand{}(mt); + for (auto &a : A) a = URand{}(rng0); for (auto b : state) expbench(A); } BENCHMARK(BM_expm)->DenseRange(2, 10, 1); static void BM_expm_dual4(benchmark::State &state) { unsigned dim = state.range(0); - std::mt19937_64 mt(0); + std::mt19937_64 rng0; using D = Dual; SquareMatrix A{SquareDims{dim}}; - for (auto &a : A) a = URand{}(mt); + for (auto &a : A) a = URand{}(rng0); for (auto b : state) expbench(A); } BENCHMARK(BM_expm_dual4)->DenseRange(2, 10, 1); static void BM_expm_dual4x2(benchmark::State &state) { unsigned dim = state.range(0); - std::mt19937_64 mt(0); + std::mt19937_64 rng0; using D = Dual, 2>; SquareMatrix A{SquareDims{dim}}; - for (auto &a : A) a = URand{}(mt); + for (auto &a : A) a = URand{}(rng0); for (auto b : state) expbench(A); } BENCHMARK(BM_expm_dual4x2)->DenseRange(2, 10, 1); @@ -315,19 +190,19 @@ 
BENCHMARK(BM_expm_dual4x2)->DenseRange(2, 10, 1); using D4D2 = Dual, 2>; using SMDD = SquareMatrix; #ifdef __INTEL_LLVM_COMPILER -using SMDD0 = math::ManagedArray; +using SMDD0 = poly::math::ManagedArray; #else -using SMDD0 = math::ManagedArray; +using SMDD0 = poly::math::ManagedArray; #endif #pragma omp declare reduction(+ : SMDD0 : omp_out += omp_in) \ initializer(omp_priv = SMDD0{omp_orig.dim(), D4D2{}}) static void BM_expm_dual4x2_threads(benchmark::State &state) { unsigned dim = state.range(0); - std::mt19937_64 mt(0); + std::mt19937_64 rng0; using D = Dual, 2>; SquareMatrix A{SquareDims{dim}}; - for (auto &a : A) a = URand{}(mt); + for (auto &a : A) a = URand{}(rng0); for (auto bch : state) { SMDD0 B{SquareDims{dim}}; B.fill(D{0}); diff --git a/benchmark/include/simplex_benchmark.hpp b/benchmark/include/simplex_benchmark.hpp index 798ce6e94..a9da9843c 100644 --- a/benchmark/include/simplex_benchmark.hpp +++ b/benchmark/include/simplex_benchmark.hpp @@ -1,10 +1,12 @@ #pragma once -#include "Math/Simplex.hpp" -#include "MatrixStringParse.hpp" +#include +#include #include +using poly::utils::operator""_mat, poly::math::_; + static void BM_Simplex0(benchmark::State &state) { - math::DenseMatrix tableau{ + poly::math::DenseMatrix tableau{ "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 " @@ -871,15 +873,16 @@ static void BM_Simplex0(benchmark::State &state) { "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 " "1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ]"_mat}; - tableau(0, _) << -5859553999884210514; - OwningArena<> alloc; + tableau[0, _] << -5859553999884210514; + poly::alloc::OwningArena<> alloc; unsigned numCon = unsigned(tableau.numRow()) - 1; unsigned numVar = unsigned(tableau.numCol()) - 1; - NotNull 
simpBackup{Simplex::create(alloc, numCon, numVar, 0)}; + poly::utils::Valid simpBackup{ + poly::math::Simplex::create(&alloc, numCon, numVar)}; simpBackup->getTableau() << tableau; // Simplex simpBackup{tableau}; - NotNull simp{Simplex::create(alloc, simpBackup->getNumCons(), - simpBackup->getNumVars(), 0)}; + poly::utils::Valid simp{poly::math::Simplex::create( + &alloc, simpBackup->getNumCons(), simpBackup->getNumVars())}; // Vector sol(37); for (auto b : state) { *simp << *simpBackup; @@ -892,7 +895,7 @@ static void BM_Simplex0(benchmark::State &state) { BENCHMARK(BM_Simplex0); static void BM_Simplex1(benchmark::State &state) { - IntMatrix tableau{ + poly::math::IntMatrix<> tableau{ "[0 0 0 1 0 -1 0 0 0 0 0 0 0 0 0 -1 0 0 0 0 0 0 1 0 -1 0 0 725849473193 " "94205055327856 11 11 11 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 -1 0 0 0 0 0 " "0 0 0 0 0 0 0 0 0 0 0 1 0 -1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 " @@ -1111,13 +1114,14 @@ static void BM_Simplex1(benchmark::State &state) { "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 " "0 0 0 0 0 0 0 0 0 ]"_mat}; - OwningArena<> alloc; + poly::alloc::OwningArena<> alloc; unsigned numCon = unsigned(tableau.numRow()) - 1; unsigned numVar = unsigned(tableau.numCol()) - 1; - NotNull simpBackup{Simplex::create(alloc, numCon, numVar, 0)}; + poly::utils::Valid simpBackup{ + poly::math::Simplex::create(&alloc, numCon, numVar, 0)}; simpBackup->getTableau() << tableau; - NotNull simp{Simplex::create(alloc, simpBackup->getNumCons(), - simpBackup->getNumVars(), 0)}; + poly::utils::Valid simp{poly::math::Simplex::create( + &alloc, simpBackup->getNumCons(), simpBackup->getNumVars(), 0)}; for (auto b : state) { *simp << *simpBackup; bool fail = simp->initiateFeasible(); diff --git a/benchmark/include/vector.hpp b/benchmark/include/vector.hpp index ce94f2185..4337dfb86 100644 --- a/benchmark/include/vector.hpp +++ b/benchmark/include/vector.hpp @@ -1,6 +1,5 @@ #pragma once #include "Math/Array.hpp" -#include 
"Math/Vector.hpp" #include #include #include diff --git a/compile_commands.json b/compile_commands.json deleted file mode 100644 index 59630f355..000000000 --- a/compile_commands.json +++ /dev/null @@ -1,637 +0,0 @@ -[ -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -fpch-instantiate-templates -Xclang -emit-pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -x c++-header -o CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.cxx", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.cxx", - "output": "CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test 
-I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -o CMakeFiles/LoopModelsTests.dir/bumpmap_test.cpp.o -c /home/chriselrod/Documents/progwork/cxx/LoopModels/test/bumpmap_test.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/bumpmap_test.cpp", - "output": "CMakeFiles/LoopModelsTests.dir/bumpmap_test.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 
-fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -o CMakeFiles/LoopModelsTests.dir/comparator_test.cpp.o -c /home/chriselrod/Documents/progwork/cxx/LoopModels/test/comparator_test.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/comparator_test.cpp", - "output": "CMakeFiles/LoopModelsTests.dir/comparator_test.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -o CMakeFiles/LoopModelsTests.dir/compat_test.cpp.o -c 
/home/chriselrod/Documents/progwork/cxx/LoopModels/test/compat_test.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/compat_test.cpp", - "output": "CMakeFiles/LoopModelsTests.dir/compat_test.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -o CMakeFiles/LoopModelsTests.dir/dependence_test.cpp.o -c /home/chriselrod/Documents/progwork/cxx/LoopModels/test/dependence_test.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/dependence_test.cpp", - "output": "CMakeFiles/LoopModelsTests.dir/dependence_test.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test 
-I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -o CMakeFiles/LoopModelsTests.dir/graph_test.cpp.o -c /home/chriselrod/Documents/progwork/cxx/LoopModels/test/graph_test.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/graph_test.cpp", - "output": "CMakeFiles/LoopModelsTests.dir/graph_test.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions 
-fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -o CMakeFiles/LoopModelsTests.dir/orthogonalize_test.cpp.o -c /home/chriselrod/Documents/progwork/cxx/LoopModels/test/orthogonalize_test.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/orthogonalize_test.cpp", - "output": "CMakeFiles/LoopModelsTests.dir/orthogonalize_test.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -o CMakeFiles/LoopModelsTests.dir/remarks_test.cpp.o -c 
/home/chriselrod/Documents/progwork/cxx/LoopModels/test/remarks_test.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/remarks_test.cpp", - "output": "CMakeFiles/LoopModelsTests.dir/remarks_test.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -o CMakeFiles/gtest.dir/src/gtest-all.cc.o -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-all.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-all.cc", - "output": "_deps/googletest-build/googletest/CMakeFiles/gtest.dir/src/gtest-all.cc.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -o CMakeFiles/gtest_main.dir/src/gtest_main.cc.o -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest_main.cc", - 
"file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest_main.cc", - "output": "_deps/googletest-build/googletest/CMakeFiles/gtest_main.dir/src/gtest_main.cc.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -fpch-instantiate-templates -Xclang -emit-pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -x c++-header -o CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.cxx", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.cxx", - "output": "LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude 
-I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -o CMakeFiles/LoopModels.dir/lib/TurboLoop.cpp.o -c /home/chriselrod/Documents/progwork/cxx/LoopModels/lib/TurboLoop.cpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/lib/TurboLoop.cpp", - "output": "LoopModels/CMakeFiles/LoopModels.dir/lib/TurboLoop.cpp.o" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow 
-D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Dicts/BumpMapSet.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Dicts/BumpMapSet.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Dicts/BumpVector.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Dicts/BumpVector.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Array.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Array.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem 
/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/Storage.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/Storage.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/ArrayOps.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/ArrayOps.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Indexing.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Indexing.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test 
-I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/AxisTypes.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/AxisTypes.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 
-fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Invariant.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Invariant.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Iterators.hpp", - "file": 
"/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Iterators.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/MatrixDimensions.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/MatrixDimensions.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Matrix.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Matrix.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/TypePromotion.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/TypePromotion.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/UniformScaling.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/UniformScaling.hpp" -}, - -{ - "directory": 
"/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Vector.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Vector.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem 
/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Rational.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Rational.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/GreatestCommonDivisor.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/GreatestCommonDivisor.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Allocators.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Allocators.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Valid.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Valid.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem 
/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Optional.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/Optional.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c 
/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include/ankerl/unordered_dense.h", - "file": "/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include/ankerl/unordered_dense.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Comparators.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Comparators.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Constraints.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Constraints.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang 
-include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/BitSets.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/BitSets.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Comparisons.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Comparisons.hpp" -}, - -{ - "directory": 
"/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/EmptyArrays.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/EmptyArrays.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem 
/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Math.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Math.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/NormalForm.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/NormalForm.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Constructors.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Constructors.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/VectorGreatestCommonDivisor.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/VectorGreatestCommonDivisor.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem 
/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Simplex.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Simplex.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/MatrixStringParse.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/MatrixStringParse.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Loops.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Loops.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Polyhedra.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Polyhedra.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/RemarkAnalysis.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/RemarkAnalysis.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/test/TestUtilities.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/TestUtilities.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/test/ArrayReference.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/test/ArrayReference.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics 
-Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Address.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Address.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/InstructionCost.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/InstructionCost.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - 
"command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Node.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Node.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow 
-fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/UnrolledList.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/UnrolledList.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Users.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Users.hpp" -}, 
- -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/ListRanges.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Utilities/ListRanges.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem 
/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Support/OStream.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Support/OStream.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan", - "command": "/usr/bin/clang++ -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test/../include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/test -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -isystem /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -std=gnu++20 -fno-exceptions -fno-rtti -ferror-limit=8 -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/CMakeFiles/LoopModelsTests.dir/cmake_pch.hxx -c 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Orthogonalize.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/Orthogonalize.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-assertion-result.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-assertion-result.h" -}, - 
-{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-message.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-message.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-port.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-port.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ 
-I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/custom/gtest-port.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/custom/gtest-port.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-port-arch.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-port-arch.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g 
-fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-death-test.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-death-test.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-death-test-internal.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-death-test-internal.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter 
-Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-matchers.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-matchers.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-printers.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-printers.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-internal.h", - "file": 
"/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-internal.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-filepath.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-filepath.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-string.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-string.h" -}, - -{ - "directory": 
"/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-type-util.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-type-util.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/custom/gtest-printers.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/custom/gtest-printers.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ 
-I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-param-test.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-param-test.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-param-util.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/gtest-param-util.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden 
-fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-test-part.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-test-part.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-typed-test.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-typed-test.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c 
/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest_pred_impl.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest_pred_impl.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest_prod.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest_prod.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-assertion-result.cc", - "file": 
"/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-assertion-result.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-death-test.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-death-test.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/custom/gtest.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/internal/custom/gtest.h" -}, - -{ - "directory": 
"/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-internal-inl.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-internal-inl.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-spi.h", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include/gtest/gtest-spi.h" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include 
-I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-filepath.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-filepath.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-matchers.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-matchers.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch 
-Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-port.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-port.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-printers.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-printers.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-test-part.cc", - "file": 
"/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-test-part.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-typed-test.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest-typed-test.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/googletest-build/googletest", - "command": "/usr/bin/clang++ -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/include -I/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest -g -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wshadow -Wconversion -DGTEST_HAS_PTHREAD=1 -fexceptions -W -Wpointer-arith -Wreturn-type -Wcast-qual -Wwrite-strings -Wswitch -Wunused-parameter -Wcast-align -Wchar-subscripts -Winline -Wredundant-decls -c /home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest.cc", - "file": "/home/chriselrod/.cache/CPM/googletest/96129d89f45386492ae46d6bb8c027bc3df5f949/googletest/src/gtest.cc" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/TurboLoop.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/TurboLoop.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic 
-Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Cache.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Cache.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/BBPredPath.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/BBPredPath.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ 
-DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Dicts/MapVector.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Dicts/MapVector.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer 
-fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Predicate.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Predicate.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/TinyVector.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Containers/TinyVector.hpp" -}, - -{ - "directory": 
"/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Instruction.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/Instruction.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden 
-fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/CostModeling.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/IR/CostModeling.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Graphs/Graphs.hpp", - "file": 
"/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Graphs/Graphs.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/LinearProgramming/LoopBlock.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/LinearProgramming/LoopBlock.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include 
-I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/LinearProgramming/ScheduledNode.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/LinearProgramming/ScheduledNode.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Dependence.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Dependence.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/DependencyPolyhedra.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/DependencyPolyhedra.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude 
-I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Schedule.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/include/Polyhedra/Schedule.hpp" -}, - -{ - "directory": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels", - "command": "/usr/bin/clang++ -DLoopModels_EXPORTS -I/home/chriselrod/Documents/progwork/cxx/LoopModels/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/PackageProjectInclude -I/home/chriselrod/.cache/CPM/unordered_dense/01b84887a44155645ad5caf7efee7b8e4141e179/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include -I/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-build/PackageProjectInclude -g -std=gnu++20 -fPIC -fvisibility=hidden -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -fstrict-aliasing -fno-plt -fstrict-overflow -fno-omit-frame-pointer -fcolor-diagnostics -Wall -Wpedantic -Wextra -Wshadow -D_GLIBCXX_ASSERTIONS -Winvalid-pch -Xclang -include-pch -Xclang 
/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx.gch -Xclang -include -Xclang /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/LoopModels/CMakeFiles/LoopModels.dir/cmake_pch.hxx -c /home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/StaticArrays.hpp", - "file": "/home/chriselrod/Documents/progwork/cxx/LoopModels/buildclang/nosan/_deps/math-src/include/Math/StaticArrays.hpp" -} -] diff --git a/include/Dicts/BumpMapSet.hpp b/include/Dicts/BumpMapSet.hpp index 0a2b836e9..e88247d5e 100644 --- a/include/Dicts/BumpMapSet.hpp +++ b/include/Dicts/BumpMapSet.hpp @@ -1,6 +1,7 @@ #pragma once +#include #include -#include +#include #include namespace poly::dict { @@ -10,7 +11,7 @@ template using map = ankerl::unordered_dense::map; template -struct amap +struct amap // NOLINT(readability-identifier-naming) : ankerl::unordered_dense::map, std::equal_to, math::BumpPtrVector>> { @@ -21,7 +22,7 @@ struct amap amap(Arena<> *alloc) : Base{WArena>(alloc)} {} }; template -struct aset +struct aset // NOLINT(readability-identifier-naming) : ankerl::unordered_dense::set, std::equal_to, math::BumpPtrVector> { using Base = @@ -29,4 +30,10 @@ struct aset std::equal_to, math::BumpPtrVector>; aset(Arena<> *alloc) : Base{WArena(alloc)} {} }; + +static_assert(std::same_as::value_container_type, + math::BumpPtrVector>>); +static_assert(std::same_as::allocator_type, + alloc::WArena, 16384, true>>); + } // namespace poly::dict diff --git a/include/Dicts/BumpVector.hpp b/include/Dicts/BumpVector.hpp index bcc22821f..c34807f26 100644 --- a/include/Dicts/BumpVector.hpp +++ b/include/Dicts/BumpVector.hpp @@ -1,13 +1,13 @@ #pragma once +#include #include #include -#include #include // In include/Dicts, as it primarily serves to support amap/aset namespace poly { -using utils::WArena, utils::Arena; +using alloc::WArena, alloc::Arena; } // namespace poly namespace poly::math 
{ @@ -31,7 +31,7 @@ template struct BumpPtrVector { [[no_unique_address]] T *mem; [[no_unique_address]] unsigned Size; [[no_unique_address]] unsigned Capacity; - [[no_unique_address]] NotNull> Alloc; + [[no_unique_address]] Valid> Alloc; constexpr BumpPtrVector(WArena a) : mem(a.allocate(InitialCapacity)), Size(0), Capacity(InitialCapacity), @@ -90,19 +90,19 @@ template struct BumpPtrVector { return mem[canonicalize(i, Size)]; } [[nodiscard]] constexpr auto front() -> T & { - assert(Size > 0); + invariant(Size > 0); return mem[0]; } [[nodiscard]] constexpr auto back() -> T & { - assert(Size > 0); + invariant(Size > 0); return mem[Size - 1]; } [[nodiscard]] constexpr auto front() const -> const T & { - assert(Size > 0); + invariant(Size > 0); return mem[0]; } [[nodiscard]] constexpr auto back() const -> const T & { - assert(Size > 0); + invariant(Size > 0); return mem[Size - 1]; } [[nodiscard]] constexpr auto isEmpty() const -> bool { return Size == 0; } @@ -114,13 +114,13 @@ template struct BumpPtrVector { // : mem(x.data()), N(x.size()) {} // constexpr MutPtrVector(T *pt, size_t NN) : mem(pt), N(NN) {} constexpr auto operator[](Range i) -> MutPtrVector { - assert(i.b <= i.e); - assert(i.e <= Size); + invariant(i.b <= i.e); + invariant(i.e <= Size); return MutPtrVector{mem + i.b, i.e - i.b}; } constexpr auto operator[](Range i) const -> PtrVector { - assert(i.b <= i.e); - assert(i.e <= Size); + invariant(i.b <= i.e); + invariant(i.e <= Size); return PtrVector{mem + i.b, i.e - i.b}; } template @@ -228,7 +228,7 @@ template struct BumpPtrVector { Capacity = N; } constexpr void truncate(size_t N) { - assert(N <= Capacity); + invariant(N <= Capacity); Size = N; } constexpr void resize(size_t N) { @@ -267,7 +267,7 @@ template struct BumpPtrVector { [[nodiscard]] constexpr auto empty() const -> bool { return Size == 0; } constexpr void pop_back() { --Size; } constexpr void erase(T *x) { - assert(x >= mem && x < mem + Size); + invariant(x >= mem && x < mem + Size); 
std::destroy_at(x); std::copy_n(x + 1, Size, x); --Size; diff --git a/include/Dicts/MapVector.hpp b/include/Dicts/MapVector.hpp index 4dc6eec27..4b375e19d 100644 --- a/include/Dicts/MapVector.hpp +++ b/include/Dicts/MapVector.hpp @@ -7,8 +7,8 @@ namespace poly::dict { template class OrderedMap { amap map; - // math::BumpPtrVector> vector; - math::ResizeableView, unsigned> vector; + // math::BumpPtrVector> vector; + math::ResizeableView, unsigned> vector; public: constexpr OrderedMap(Arena<> *alloc) : map(alloc), vector() {} @@ -64,10 +64,10 @@ template class OrderedMap { } constexpr void grow(unsigned i) { if (i == vector.getCapacity()) - vector.reserve(*(map.get_allocator().get_allocator()), + vector.reserve((map.get_allocator().get_allocator()), std::max(8, 2 * i)); } - constexpr void insert(std::pair &&value) { + constexpr void insert(containers::Pair &&value) { insert(std::move(value.first), std::move(value.second)); } constexpr void clear() { diff --git a/include/Dicts/Trie.hpp b/include/Dicts/Trie.hpp new file mode 100644 index 000000000..8c3f2e1f8 --- /dev/null +++ b/include/Dicts/Trie.hpp @@ -0,0 +1,229 @@ +#pragma once +#include "Containers/Pair.hpp" +#include +#include +#include +#include +#include +#include + +namespace poly::dict { +using utils::invariant, containers::Pair; + +template constexpr auto fastHash(const T &x) -> uint64_t { + return ankerl::unordered_dense::hash{}(x); +} +template constexpr auto fastHash(T *x) -> uint64_t { + return reinterpret_cast(x) >> + std::countr_zero(alignof(std::max_align_t)); +} + +// Idea from from https://nullprogram.com/blog/2023/09/30/ +template struct TrieMapNode { + K first; + V second{}; + std::array *, 4> children{}; + + constexpr auto find(const K &k) -> TrieMapNode * { + return findChild(k).child; + } + +protected: + struct Child { + TrieMapNode *child; + TrieMapNode *parent; + uint64_t index; // child == parent->children[index]; + }; + constexpr auto isLeaf() -> bool { + return first && 
!std::ranges::any_of(children); + } + constexpr auto getLeaf() -> Child { + if (!first) return {nullptr, nullptr, 0}; + for (size_t i = 0; i < std::size(children); ++i) + if (TrieMapNode *child = children[i]) + if (Child leaf = child->getLeaf(); leaf.child) + return leaf.parent ? leaf : Child{leaf.child, this, i}; + return {this, nullptr, 0}; + } + constexpr auto getSubLeaf() -> Child { + Child c = getLeaf(); + return c.child != this ? c : Child{nullptr, nullptr, 0}; + } + auto findChild(const K &k) -> Child { + if (k == first) return {this, nullptr, 0}; + TrieMapNode *p = this, *c = nullptr; + for (uint64_t h = fastHash(k);; h >>= 2) { + c = p->children[h & 3]; + if (!c || (c->first == k)) return {c, p, h & 3}; + p = c; + } + } + // Returns the removed node + auto eraseImpl(const K &k) -> TrieMapNode * { + Child child = findChild(k); + if (!child.child) return nullptr; + // we're erasing `child` + Child l = child.child->getSubLeaf(); + if (l.child) { + l.parent->children[l.index] = nullptr; // leaf is moved up + std::swap(l.child->children, child.child->children); + } + child.parent->children[child.index] = l.child; // leaf replaces deleted + child.child->second = {}; + return child.child; + } +}; + +// If `EfficientErase = true`, it stores a list of erased nodes. +// Future allocations will allocate from this list if possible. +// Thus, whenever using a pattern that involves interleaving erase and +// insertions, it is worth setting `EfficientErase = true`. It is common enough +// not to do this, that the option for `false` also exists. Don't pay for what +// you don't use. 
+template +struct TrieMap : TrieMapNode { + using NodeT = TrieMapNode; + NodeT *list{nullptr}; + // TODO: implement using `list` to avoid allocs + void erase(const K &k) { + if (NodeT *erased = this->eraseImpl(k)) + erased->children[0] = std::exchange(list, erased); + } + auto operator[](utils::Valid> alloc, const K &k) -> V & { + typename NodeT::Child c = this->findChild(k); + if (c.child) return c.child->second; + invariant(c.parent != nullptr); + invariant(c.index < 4); + NodeT *&res = c.parent->children[c.index]; + invariant(res == nullptr); + if (list) { + res = list; + list = std::exchange(list->children[0], nullptr); + res->second = {}; + } else { + res = alloc->create(); + invariant(res->second == V{}); + } + res->first = k; + return res->second; + } +}; + +template struct TrieMap : TrieMapNode { + using NodeT = TrieMapNode; + void erase(const K &k) { this->eraseImpl(k); } + auto operator[](utils::Valid> alloc, const K &k) -> V & { + typename NodeT::Child c = findChild(k); + if (c.child) return c.child->second; + invariant(c.parent != nullptr); + invariant(c.index < 4); + invariant(c.parent->children[c.index] == nullptr); + TrieMapNode res = c.parent->children[c.index] = alloc->create(); + res->first = k; + return res->second; + } +}; + +static_assert(sizeof(TrieMap) == + sizeof(TrieMapNode)); +static_assert(sizeof(TrieMap) == + sizeof(TrieMapNode) + sizeof(TrieMapNode *)); + +// Optional can be specialized for types to add dead-values without requiring +// extra space. E.g., `sizeof(utils::Optional) == sizeof(T*)`, as `nullptr` +// indicates empty. +template struct InlineTrie { + InlineTrie *children[4]{}; + utils::Optional keys[4]{}; + V values[4]{}; + + // Returns an optional pointer to the value. + constexpr auto find(const K &k) -> utils::Optional { + auto [node, index] = findChild(this, k); + return node ? 
utils::Optional{node->values[index]} : std::nullopt; + } + + auto operator[](utils::Valid> alloc, const K &k) -> V & { + Child c = findChild(this, k); + if (c.subIndex) { + c.node = c.node->children[*c.subIndex] = + alloc->create>(); + c.node->keys[c.index] = k; + } + return c.node->values[c.index]; + } + + void erase(const K &k) { + auto [child, index] = findChild(this, k); + if (!child) return; // was not found + // We now find a leaf key/value pair, and move them here. + if (InlineTrie *descendent = child->children[index]) { + auto [lc, li] = descendent->findLeaf(); + if (lc) { + child->keys[index] = std::move(lc->keys[li]); + child->values[index] = std::move(lc->values[li]); + child = lc; + index = li; + } + } + child->keys[index] = {}; // set to null + child->values[index] = {}; + } + +private: + auto isLeaf(int i) -> bool { + if (!keys[i]) return false; + if (!children[i]) return true; + for (int j = 0; j < 4; ++j) + if (!children[i]->isLeaf(j)) return false; + return true; + } + // A leaf is a key without any child keys. + // A leaf may have children without keys. + auto findLeaf() -> Pair { + InlineTrie *leaf = this; + bool descend[4]{false, false, false, false}; + for (ptrdiff_t i = 0; i < std::ssize(children); ++i) { + if (!leaf->keys[i]) continue; // need key to be leaf + if (!leaf->children[i]) return {leaf, i}; // no children, no child keys + descend[i] = true; + } + for (ptrdiff_t i = 0; i < std::ssize(children); ++i) { + if (!descend[i]) continue; + auto ret = leaf->children[i]->findLeaf(); + return ret.first ? 
ret : Pair{this, i}; + }; + return {nullptr, 0}; + } + struct Child { + InlineTrie *node; + size_t index; + utils::Optional subIndex; + }; + + template + static constexpr auto findChild(InlineTrie *node, const K &k) { + for (uint64_t h = fastHash(k);;) { + uint64_t ind = h & 3; + bool noKey = !node->keys[ind]; + if constexpr (Insert) { + if (noKey) node->keys[ind] = k; + if (noKey || (*node->keys[ind] == k)) return Child{node, ind, {}}; + } else { + if (noKey) return Pair{nullptr, ind}; + if (*node->keys[ind] == k) + return Pair{node, ind}; + } + h >>= 2; + if (!node->children[ind]) { + if constexpr (Insert) return Child{node, h & 3, ind}; + else return Pair{nullptr, ind}; + } + node = node->children[ind]; + } + }; +}; + +// static_assert(sizeof(std::array*,0 >)==1); + +} // namespace poly::dict diff --git a/include/Graphs/Bipartite.hpp b/include/Graphs/Bipartite.hpp index 059b8c9f6..8cdeaba6d 100644 --- a/include/Graphs/Bipartite.hpp +++ b/include/Graphs/Bipartite.hpp @@ -31,7 +31,7 @@ inline auto bipartiteMatch(Matrix &bpGraph, int u, /// Returns maximum number /// of matching from M to N inline auto maxBipartiteMatch(Matrix &bpGraph) - -> std::pair> { + -> containers::Pair> { // An array to keep track of the // applicants assigned to jobs. // The value of matchR[i] is the @@ -39,7 +39,7 @@ inline auto maxBipartiteMatch(Matrix &bpGraph) // the value -1 indicates nobody is // assigned. 
auto [N, M] = bpGraph.size(); - std::pair> res{0, {unsigned(N), -1}}; + containers::Pair> res{0, {unsigned(N), -1}}; size_t &result = res.first; Vector &matchR{res.second}; if (M) { diff --git a/include/Graphs/Graphs.hpp b/include/Graphs/Graphs.hpp index 32f621b8a..afdb1a33f 100644 --- a/include/Graphs/Graphs.hpp +++ b/include/Graphs/Graphs.hpp @@ -1,6 +1,6 @@ #pragma once +#include #include -#include namespace poly::graph { // Currently, only implements top sort, and Tarjan's strongly connected @@ -34,13 +34,9 @@ namespace poly::graph { // template concept AbstractPtrGraph = requires(G g, typename G::VertexType *v) { - { - *(g.getVertices(v).begin()) - } -> std::template same_as; + { *(g.getVertices(v).begin()) } -> std::same_as; { g.getVertices(v) } -> std::ranges::forward_range; - { - *(g.outNeighbors(v).begin()) - } -> std::template same_as; + { *(g.outNeighbors(v).begin()) } -> std::same_as; { g.outNeighbors(v) } -> std::ranges::forward_range; { v->index() } -> std::assignable_from; { v->lowLink() } -> std::assignable_from; @@ -50,10 +46,10 @@ concept AbstractPtrGraph = requires(G g, typename G::VertexType *v) { { v->visited() } -> std::same_as; { v->visit() }; { v->unVisit() }; - { v->setNext(v) } -> std::template same_as; - { v->getNext() } -> std::template same_as; - { v->setNextComponent(v) } -> std::template same_as; - { v->getNextComponent() } -> std::template same_as; + { v->setNext(v) } -> std::same_as; + { v->getNext() } -> std::same_as; + { v->setNextComponent(v) } -> std::same_as; + { v->getNextComponent() } -> std::same_as; }; template struct State { @@ -99,8 +95,8 @@ template inline auto stronglyConnectedComponents(G g, vertex_t *seed) -> vertex_t * { using N = vertex_t; - State state{}; - for (auto *v : g->getVertices(seed)) + State state{}; + for (auto *v : g.getVertices(seed)) if (!v->wasVisited()) state = strongConnect(g, state, v); return state.components; } diff --git a/include/IR/Address.hpp b/include/IR/Address.hpp index 
6a07806a1..862c49068 100644 --- a/include/IR/Address.hpp +++ b/include/IR/Address.hpp @@ -2,35 +2,39 @@ #include "IR/InstructionCost.hpp" #include "IR/Node.hpp" +#include "IR/OrthogonalAxes.hpp" #include "IR/Users.hpp" #include "Polyhedra/Loops.hpp" #include "Support/OStream.hpp" #include "Utilities/ListRanges.hpp" +#include #include #include #include #include -#include #include #include #include #include +#include #include #include #include +#include namespace poly { namespace lp { class ScheduledNode; } // namespace lp namespace poly { -class Dependence; +struct Dependence; class Dependencies; } // namespace poly namespace IR { using math::PtrVector, math::MutPtrVector, math::DensePtrMatrix, math::MutDensePtrMatrix, math::SquarePtrMatrix, math::_, math::DenseDims, - math::PtrMatrix, math::end, poly::Dependence, poly::Dependencies; + math::PtrMatrix, math::end, poly::Dependence, poly::Dependencies, + utils::ListRange; /// Represents a memory access that has been rotated according to some affine /// transform. @@ -78,14 +82,22 @@ class Addr : public Instruction { int32_t edgeIn{-1}; int32_t edgeOut{-1}; lp::ScheduledNode *node; - NotNull basePointer; + Valid basePointer; poly::Loop *loop{nullptr}; llvm::Instruction *instr; int64_t *offSym{nullptr}; const llvm::SCEV **syms; Value *predicate{nullptr}; Addr *origNext{nullptr}; - unsigned numDim{0}, numDynSym{0}; + /// We find reductionns during `IROptimizer` initialization + /// after sorting edges and removing redundant `Addr` + /// this is because we may have multiple repeat stores to the the same + /// location, and a reduction would be the closest pair. Thus, we want to have + /// an ordering. 
+ Addr *reassociableReduction{nullptr}; // if reduction, corresponding addr + uint16_t numDim{0}, numDynSym{0}; + int32_t topologicalPosition; + OrthogonalAxes axes; #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpedantic" @@ -107,12 +119,6 @@ class Addr : public Instruction { numLoops, natDepth, maxNumLoops), basePointer(arrayPtr), instr(user), offSym(offsym), syms(s), numDim(dimOff[0]), numDynSym(dimOff[1]){}; - explicit Addr(const llvm::SCEVUnknown *arrayPtr, llvm::Instruction *user, - unsigned numLoops) - : Instruction(llvm::isa(user) ? VK_Stow : VK_Load, - numLoops), - basePointer(arrayPtr), instr(user){}; - /// Constructor for 0 dimensional memory access [[nodiscard]] constexpr auto getIntMemory() -> int64_t * { return mem; } [[nodiscard]] constexpr auto getIntMemory() const -> int64_t * { @@ -127,15 +133,63 @@ class Addr : public Instruction { [[nodiscard]] constexpr auto indMatPtr() const -> int64_t * { return getIntMemory() + 1 + getArrayDim(); } - [[nodiscard]] auto getSymbolicOffsets() -> MutPtrVector { + [[nodiscard]] constexpr auto getSymbolicOffsets() + -> MutPtrVector { return {syms + numDim, numDynSym}; } [[nodiscard]] constexpr auto offsetMatrix() -> MutDensePtrMatrix { - return {offSym, DenseDims{getArrayDim(), numDynSym}}; + return {offSym, DenseDims<>{{getArrayDim()}, {numDynSym}}}; } + /// recursive reassociability search public: - constexpr void rotate(NotNull explicitLoop, + [[nodiscard]] constexpr auto getOrthAxes() const -> OrthogonalAxes { + return axes; + } + constexpr auto calcOrthAxes(ptrdiff_t depth) -> OrthogonalAxes { + invariant((depth <= 24) && (depth >= 0)); + invariant(depth >= naturalDepth); + invariant(currentDepth >= depth); + currentDepth = depth; + bool indepAxes = true; + uint32_t contig{0}, indep{(uint32_t(1) << depth) - 1}; + /// indexMatrix() -> arrayDim() x getNumLoops() + DensePtrMatrix inds{indexMatrix()}; + for (ptrdiff_t l = 0; l < inds.numCol(); ++l) { + 
if (!inds[0, l]) continue; + contig |= uint32_t(1) << l; + indep &= ~(uint32_t(1) << l); + } + for (ptrdiff_t d = 1; d < inds.numRow(); ++d) { + for (ptrdiff_t l = 0; l < inds.numCol(); ++l) { + if (!inds[d, l]) continue; + if (!(indep & (uint32_t(1) << l))) indepAxes = false; + indep &= ~(uint32_t(1) << l); + } + } + axes = {indepAxes, contig, indep}; + return axes; + } + [[nodiscard]] constexpr auto isDropped() const -> bool { + return (getNext() == nullptr) && (getPrev() == nullptr); + } + constexpr void setTopPosition(int32_t pos) { topologicalPosition = pos; } + [[nodiscard]] constexpr auto getTopPosition() const -> int32_t { + return topologicalPosition; + } + + /// Constructor for 0 dimensional memory access + /// public for use with `std::construct_at` + /// Perhaps it should use a passkey? + explicit Addr(const llvm::SCEVUnknown *arrayPtr, llvm::Instruction *user, + unsigned numLoops) + : Instruction(llvm::isa(user) ? VK_Stow : VK_Load, + numLoops), + basePointer(arrayPtr), instr(user){}; + + /// This gets called to rotate so that we can make direct comparisons down the + /// road without needing rotations. + constexpr void rotate(Valid explicitLoop, SquarePtrMatrix Pinv, int64_t denom, PtrVector omega, int64_t *offsets) { loop = explicitLoop; @@ -143,7 +197,7 @@ class Addr : public Instruction { unsigned oldNatDepth = getNaturalDepth(); DensePtrMatrix M{indexMatrix()}; // aD x nLma MutPtrVector offsetOmega{getOffsetOmega()}; - unsigned depth = this->naturalDepth = uint8_t(Pinv.numCol()); + unsigned depth = uint8_t(ptrdiff_t(Pinv.numCol())); MutDensePtrMatrix mStar{indexMatrix()}; // M is implicitly padded with zeros, newNumLoops >= oldNumLoops invariant(maxDepth >= naturalDepth); @@ -159,32 +213,32 @@ class Addr : public Instruction { // as a temporary, to avoid the aliasing problem. 
// // Use `M` before updating it, to update `offsetOmega` - if (offsets) offsetOmega -= M * PtrVector{offsets, oldNatDepth}; + if (offsets) + offsetOmega -= PtrVector{offsets, oldNatDepth} * M.t(); // update `M` into `mStar` // mStar << M * Pinv(_(0, oldNumLoops), _); MutPtrVector buff{getFusionOmega()[_(0, math::last)]}; - invariant(buff.size(), unsigned(depth)); + invariant(buff.size(), ptrdiff_t(depth)); unsigned newNatDepth = 0; for (ptrdiff_t d = getArrayDim(); d--;) { buff << 0; - for (ptrdiff_t k = 0; k < oldNatDepth; ++k) buff += M(d, k) * Pinv(k, _); - mStar(d, _) << buff; + for (ptrdiff_t k = 0; k < oldNatDepth; ++k) buff += M[d, k] * Pinv[k, _]; + mStar[d, _] << buff; if (newNatDepth == depth) continue; - // find last + // find last, as buf goes outer<->inner auto range = std::ranges::reverse_view{buff[_(newNatDepth, depth)]}; auto m = std::ranges::find_if(range, [](int64_t i) { return i != 0; }); if (m == range.end()) continue; newNatDepth = depth - std::distance(range.begin(), m); } // use `mStar` to update offsetOmega` - offsetOmega -= mStar * omega; + offsetOmega -= omega * mStar.t(); + this->naturalDepth = newNatDepth; if (newNatDepth == depth) return; invariant(newNatDepth < depth); - this->naturalDepth = newNatDepth; MutDensePtrMatrix indMat{this->indexMatrix()}; for (ptrdiff_t d = 1; d < getArrayDim(); ++d) - indMat(d, _) << mStar(d, _(0, newNatDepth)); - this->naturalDepth = newNatDepth; + indMat[d, _] << mStar[d, _(0, newNatDepth)]; } // NOTE: this requires `nodeOrDepth` to be set to innmost loop depth [[nodiscard]] constexpr auto indexedByInnermostLoop() -> bool { @@ -193,7 +247,7 @@ class Addr : public Instruction { return ret; } [[nodiscard]] constexpr auto eachAddr() { - return utils::ListRange{this, [](Addr *a) { return a->getNextAddr(); }}; + return ListRange{this, [](Addr *a) -> Addr * { return a->getNextAddr(); }}; } constexpr auto getNextAddr() -> Addr * { return origNext; } [[nodiscard]] constexpr auto getNextAddr() const -> const 
Addr * { @@ -208,6 +262,17 @@ class Addr : public Instruction { origNext = a; return this; } + // Called from IROptimizer + // In a reduction, `in` must be a load and `out` a store + // This should only be called once, between nearest load/store pair + // as it doesn't store detecting invalidity. + // It checks for invalidity, in which case it doesn't set the reassociable + // reduction. + constexpr inline void maybeReassociableReduction(const Dependencies &); + constexpr auto reassociableReductionPair() -> Addr * { + return reassociableReduction; + } + [[nodiscard]] static constexpr auto intMemNeeded(size_t numLoops, size_t dim) -> size_t { // d = dim, l = numLoops @@ -243,26 +308,37 @@ class Addr : public Instruction { return node; } constexpr void setNode(lp::ScheduledNode *n) { node = n; } - [[nodiscard]] inline auto inputAddrs(Dependencies) const; - [[nodiscard]] inline auto outputAddrs(Dependencies) const; - [[nodiscard]] inline auto inputAddrs(Dependencies, unsigned depth) const; - [[nodiscard]] inline auto outputAddrs(Dependencies, unsigned depth) const; - [[nodiscard]] inline auto inputEdges(Dependencies) const; - [[nodiscard]] inline auto outputEdges(Dependencies) const; - [[nodiscard]] inline auto inputEdges(Dependencies, unsigned depth) const; - [[nodiscard]] inline auto outputEdges(Dependencies, unsigned depth) const; - [[nodiscard]] inline auto inputEdgeIDs(Dependencies) const; - [[nodiscard]] inline auto outputEdgeIDs(Dependencies) const; - [[nodiscard]] inline auto inputEdgeIDs(Dependencies, unsigned depth) const; - [[nodiscard]] inline auto outputEdgeIDs(Dependencies, unsigned depth) const; + [[nodiscard]] inline auto inputAddrs(const Dependencies &) const; + [[nodiscard]] inline auto outputAddrs(const Dependencies &) const; + [[nodiscard]] inline auto inputAddrs(const Dependencies &, int depth) const; + [[nodiscard]] inline auto outputAddrs(const Dependencies &, int depth) const; + [[nodiscard]] inline auto inputEdges(const Dependencies &) const; 
+ [[nodiscard]] inline auto outputEdges(const Dependencies &) const; + [[nodiscard]] inline auto inputEdges(const Dependencies &, int depth) const; + [[nodiscard]] inline auto outputEdges(const Dependencies &, int depth) const; + [[nodiscard]] inline auto inputEdgeIDs(const Dependencies &) const + -> utils::VForwardRange; + [[nodiscard]] inline auto outputEdgeIDs(const Dependencies &) const + -> utils::VForwardRange; + [[nodiscard]] inline auto inputEdgeIDs(const Dependencies &, int depth) const; + [[nodiscard]] inline auto outputEdgeIDs(const Dependencies &, + int depth) const; + [[nodiscard]] inline auto unhoistableOutputs(const Dependencies &, + int depth) const; + [[nodiscard]] static auto zeroDim(Arena<> *alloc, + llvm::SCEVUnknown const *arrayPtr, + llvm::Instruction *loadOrStore, + unsigned numLoops) { + return alloc->create(arrayPtr, loadOrStore, numLoops); + } /// Constructor for regular indexing [[nodiscard]] static auto construct(Arena<> *alloc, const llvm::SCEVUnknown *arrayPtr, llvm::Instruction *user, PtrMatrix indMat, std::array, 2> szOff, PtrVector coffsets, int64_t *offsets, unsigned numLoops, - unsigned maxNumLoops) -> NotNull { + unsigned maxNumLoops) -> Valid { // we don't want to hold any other pointers that may need freeing unsigned arrayDim = szOff[0].size(), nOff = szOff[1].size(); size_t memNeeded = intMemNeeded(maxNumLoops, arrayDim); @@ -272,24 +348,24 @@ class Addr : public Instruction { alloc->allocate(arrayDim + nOff + numLoops - 1); unsigned natDepth = numLoops; for (; natDepth; --natDepth) - if (math::anyNEZero(indMat(_, natDepth - 1))) break; + if (math::anyNEZero(indMat[_, natDepth - 1])) break; auto *ma = new (mem) Addr(arrayPtr, user, offsets, syms, std::array{arrayDim, nOff}, numLoops, natDepth, maxNumLoops); std::copy_n(szOff[0].begin(), arrayDim, syms); std::copy_n(szOff[1].begin(), nOff, syms + arrayDim); - ma->indexMatrix() << indMat(_, _(0, natDepth)); // naturalDepth + ma->indexMatrix() << indMat[_, _(0, natDepth)]; // 
naturalDepth ma->getOffsetOmega() << coffsets; return ma; } /// copies `o` and decrements the last element /// it decrements, as we iterate in reverse order constexpr void setFusionOmega(MutPtrVector o) { - invariant(o.size(), getCurrentDepth() + 1); + invariant(o.size(), ptrdiff_t(getCurrentDepth()) + 1); std::copy_n(o.begin(), getCurrentDepth(), getFusionOmega().begin()); getFusionOmega().back() = o.back()--; } - [[nodiscard]] auto reload(Arena<> *alloc) -> NotNull { + [[nodiscard]] auto reload(Arena<> *alloc) -> Valid { size_t memNeeded = intMemNeeded(maxDepth, numDim); void *p = alloc->allocate(sizeof(Addr) + memNeeded * sizeof(int64_t)); *static_cast(p) = VK_Load; @@ -305,10 +381,11 @@ class Addr : public Instruction { r->edgeOut = -1; return r; } - [[nodiscard]] auto getSizes() const -> PtrVector { + [[nodiscard]] constexpr auto getSizes() const + -> PtrVector { return {syms, numDim}; } - [[nodiscard]] auto getSymbolicOffsets() const + [[nodiscard]] constexpr auto getSymbolicOffsets() const -> PtrVector { return {syms + numDim, numDynSym}; } @@ -316,7 +393,7 @@ class Addr : public Instruction { return v->getKind() <= VK_Stow; } [[nodiscard]] constexpr auto getArrayPointer() const - -> NotNull { + -> Valid { return basePointer; } [[nodiscard]] auto getType() const -> llvm::Type * { @@ -324,12 +401,16 @@ class Addr : public Instruction { } [[nodiscard]] constexpr auto dependsOnIndVars(size_t d) -> bool { for (size_t i = 0, D = getArrayDim(); i < D; ++i) - if (anyNEZero(indexMatrix()(i, _(d, end)))) return true; + if (anyNEZero(indexMatrix()[i, _(d, end)])) return true; return false; } - [[nodiscard]] constexpr auto getAffLoop() const -> NotNull { + [[nodiscard]] constexpr auto getAffLoop() const -> Valid { return loop; } + /// Get the value stored by this instruction. + /// invariant: this instruction must only be called if `Addr` is a store! + /// For a load, use `getUsers()` to get a range of the users. + /// Returns the parent (other than predicates). 
[[nodiscard]] constexpr auto getStoredVal() const -> Value * { invariant(isStore()); return users.getVal(); @@ -351,6 +432,10 @@ class Addr : public Instruction { invariant(Value::classof(n)); predicate = static_cast(n); } + /// Get the users of this load. + /// invariant: this instruction must only be called if `Addr` is a load! + /// For a store, use `getStoredVal()` to get the stored value. + /// Returns the children. [[nodiscard]] constexpr auto getUsers() -> Users & { invariant(isLoad()); return users; @@ -375,22 +460,22 @@ class Addr : public Instruction { MutPtrVector sym{getSymbolicOffsets()}; offSym = alloc->allocate(size_t(numDynSym) * numDim); MutDensePtrMatrix offsMat{offsetMatrix()}; - if (dynSymInd) offsMat(_, _(0, dynSymInd)) << oldOffsMat; + if (dynSymInd) offsMat[_, _(0, dynSymInd)] << oldOffsMat; llvm::Loop *L = loop->getLLVMLoop(); for (unsigned d = loop->getNumLoops() - numToPeel; d--;) L = L->getParentLoop(); for (size_t i = numToPeel; i;) { L = L->getParentLoop(); - if (allZero(Rt(_, --i))) continue; + if (allZero(Rt[_, --i])) continue; // push the SCEV auto *iTyp = L->getInductionVariable(*SE)->getType(); const llvm::SCEV *S = SE->getAddRecExpr( SE->getZero(iTyp), SE->getOne(iTyp), L, llvm::SCEV::NoWrapMask); if (const llvm::SCEV **j = std::ranges::find(sym, S); j != sym.end()) { --numDynSym; - offsMat(_, std::distance(sym.begin(), j)) += Rt(_, i); + offsMat[_, std::distance(sym.begin(), j)] += Rt[_, i]; } else { - offsMat(_, dynSymInd) << Rt(_, i); + offsMat[_, dynSymInd] << Rt[_, i]; sym[dynSymInd++] = S; } } @@ -454,12 +539,14 @@ class Addr : public Instruction { return {getIntMemory() + 1, getArrayDim()}; } /// indexMatrix() -> arrayDim() x getNumLoops() + /// First dimension is contiguous [[nodiscard]] constexpr auto indexMatrix() -> MutDensePtrMatrix { - return {indMatPtr(), DenseDims{getArrayDim(), getNaturalDepth()}}; + return {indMatPtr(), DenseDims<>{{getArrayDim()}, {getNaturalDepth()}}}; } /// indexMatrix() -> arrayDim() x 
getNumLoops() + /// First dimension is contiguous [[nodiscard]] constexpr auto indexMatrix() const -> DensePtrMatrix { - return {indMatPtr(), DenseDims{getArrayDim(), getNaturalDepth()}}; + return {indMatPtr(), DenseDims<>{{getArrayDim()}, {getNaturalDepth()}}}; } [[nodiscard]] constexpr auto getFusionOmega() -> MutPtrVector { unsigned L = getCurrentDepth() + 1; @@ -475,15 +562,17 @@ class Addr : public Instruction { } [[nodiscard]] constexpr auto offsetMatrix() const -> DensePtrMatrix { invariant(offSym != nullptr || numDynSym == 0); - return {offSym, DenseDims{getArrayDim(), numDynSym}}; + return {offSym, DenseDims<>{{getArrayDim()}, {numDynSym}}}; } - [[nodiscard]] constexpr auto getLoop() -> NotNull { return loop; } - [[nodiscard]] constexpr auto sizesMatch(NotNull x) const -> bool { + [[nodiscard]] constexpr auto getAffineLoop() -> Valid { + return loop; + } + [[nodiscard]] constexpr auto sizesMatch(Valid x) const -> bool { auto thisSizes = getSizes(), xSizes = x->getSizes(); return std::equal(thisSizes.begin(), thisSizes.end(), xSizes.begin(), xSizes.end()); } - auto calculateCostContiguousLoadStore(llvm::TargetTransformInfo &TTI, + auto calculateCostContiguousLoadStore(const llvm::TargetTransformInfo &TTI, unsigned int vectorWidth) -> cost::RecipThroughputLatency { constexpr unsigned int addrSpace = 0; @@ -507,11 +596,61 @@ class Addr : public Instruction { llvm::TargetTransformInfo::TCK_Latency)}; } - auto getCost(llvm::TargetTransformInfo &TTI, cost::VectorWidth W) - -> cost::RecipThroughputLatency { - // TODO: cache? 
- return calculateCostContiguousLoadStore(TTI, W.getWidth()); - } + /// RecipThroughput + struct Costs { + double contiguous; + double discontiguous; + double scalar; + constexpr auto operator+=(Costs c) -> Costs & { + contiguous += c.contiguous; + discontiguous += c.discontiguous; + scalar += c.scalar; + return *this; + } + }; + auto calcCostContigDiscontig(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) -> Costs { + constexpr unsigned int addrSpace = 0; + llvm::Type *T = cost::getType(getType(), vectorWidth); + llvm::Align alignment = getAlign(); + + llvm::Intrinsic::ID id = + isLoad() ? llvm::Instruction::Load : llvm::Instruction::Store; + + llvm::InstructionCost gsc{TTI.getGatherScatterOpCost( + id, T, basePointer->getValue(), predicate, alignment, + llvm::TargetTransformInfo::TCK_RecipThroughput)}, + contig, scalar; + + if (!predicate) { + contig = + TTI.getMemoryOpCost(id, T, alignment, addrSpace, + llvm::TargetTransformInfo::TCK_RecipThroughput); + scalar = + TTI.getMemoryOpCost(id, T, alignment, addrSpace, + llvm::TargetTransformInfo::TCK_RecipThroughput); + } else { + llvm::Intrinsic::ID mid = + isLoad() ? llvm::Intrinsic::masked_load : llvm::Intrinsic::masked_store; + contig = TTI.getMaskedMemoryOpCost( + mid, T, alignment, addrSpace, + llvm::TargetTransformInfo::TCK_RecipThroughput); + scalar = TTI.getMaskedMemoryOpCost( + mid, T, alignment, addrSpace, + llvm::TargetTransformInfo::TCK_RecipThroughput); + } + double dc{NAN}, dd{NAN}, ds{NAN}; + if (std::optional o = contig.getValue()) dc = *o; + if (std::optional o = gsc.getValue()) dd = *o; + if (std::optional o = scalar.getValue()) ds = *o; + return {dc, dd, ds}; + } + inline auto reductionLatency(const llvm::TargetTransformInfo &TTI, + unsigned vectorWidth) + -> llvm::InstructionCost::CostType; + + /// drop `this` and remove it from `Dependencies` + inline void drop(Dependencies &); void printDotName(llvm::raw_ostream &os) const { if (isLoad()) os << "... 
= "; @@ -524,7 +663,7 @@ class Addr : public Instruction { if (i) os << ", "; bool printPlus = false; for (ptrdiff_t j = 0; j < numLoops; ++j) { - if (int64_t Aji = A(i, j)) { + if (int64_t Aji = A[i, j]) { if (printPlus) { if (Aji <= 0) { Aji *= -1; @@ -537,7 +676,7 @@ class Addr : public Instruction { } } for (ptrdiff_t j = 0; j < B.numCol(); ++j) { - if (int64_t offij = j ? B(i, j) : b[i]) { + if (int64_t offij = j ? B[i, j] : b[i]) { if (printPlus) { if (offij <= 0) { offij *= -1; @@ -579,7 +718,7 @@ inline auto operator<<(llvm::raw_ostream &os, const Addr &m) if (i) os << ", "; bool printPlus = false; for (ptrdiff_t j = 0; j < numLoops; ++j) { - if (int64_t Aji = A(i, j)) { + if (int64_t Aji = A[i, j]) { if (printPlus) { if (Aji <= 0) { Aji *= -1; @@ -592,7 +731,7 @@ inline auto operator<<(llvm::raw_ostream &os, const Addr &m) } } for (ptrdiff_t j = 0; j < offs.numCol(); ++j) { - if (int64_t offij = offs(i, j)) { + if (int64_t offij = offs[i, j]) { if (printPlus) { if (offij <= 0) { offij *= -1; @@ -640,7 +779,7 @@ class AddrWrapper { return addr == other.addr; } [[nodiscard]] constexpr auto getLoop() const -> poly::Loop * { - return addr->getLoop(); + return addr->getAffineLoop(); } constexpr operator Addr *() { return addr; } }; @@ -652,7 +791,7 @@ class Load : public AddrWrapper { Load(Node *a) : AddrWrapper(a->getKind() == Node::VK_Load ? static_cast(a) : nullptr) {} - [[nodiscard]] constexpr auto getInstruction() const -> llvm::Instruction * { + [[nodiscard]] auto getInstruction() const -> llvm::Instruction * { // could be load or store return llvm::cast(this->addr->getInstruction()); } @@ -664,7 +803,7 @@ class Stow : public AddrWrapper { Stow(Node *a) : AddrWrapper(a->getKind() == Node::VK_Stow ? 
static_cast(a) : nullptr) {} - [[nodiscard]] constexpr auto getInstruction() const -> llvm::StoreInst * { + [[nodiscard]] auto getInstruction() const -> llvm::StoreInst * { // must be store return llvm::cast(this->addr->getInstruction()); } diff --git a/include/IR/BBPredPath.hpp b/include/IR/BBPredPath.hpp index d0121a5d7..22ca821f1 100644 --- a/include/IR/BBPredPath.hpp +++ b/include/IR/BBPredPath.hpp @@ -55,7 +55,7 @@ class Map { [[nodiscard]] auto operator[](llvm::Instruction *inst) -> std::optional { return (*this)[inst->getParent()]; } - void insert(std::pair &&pair) { + void insert(containers::Pair &&pair) { map.insert(std::move(pair)); } [[nodiscard]] auto contains(llvm::BasicBlock *BB) const -> bool { diff --git a/include/IR/Cache.hpp b/include/IR/Cache.hpp index 4ca7d45c2..508db3f6e 100644 --- a/include/IR/Cache.hpp +++ b/include/IR/Cache.hpp @@ -44,8 +44,8 @@ struct AddrChain { [[nodiscard]] constexpr auto getStores() const { Addr *S = (addr && addr->isStore()) ? addr : nullptr; return utils::ListRange(S, [](Addr *A) -> Addr * { - Addr *S = A->getNextAddr(); - if (S && S->isStore()) return S; + Addr *W = A->getNextAddr(); + if (W && W->isStore()) return W; return nullptr; }); } @@ -145,7 +145,7 @@ struct TreeResult { [[nodiscard]] constexpr auto getAddr() const { return addr.getAddr(); } [[nodiscard]] constexpr auto getLoads() const { return addr.getLoads(); } [[nodiscard]] constexpr auto getStores() const { return addr.getStores(); } - void setLoopNest(NotNull L) const { + void setLoopNest(Valid L) const { for (Addr *A : getAddr()) A->setLoopNest(L); } constexpr auto operator*=(TreeResult tr) -> TreeResult & { @@ -156,7 +156,7 @@ struct TreeResult { } [[nodiscard]] constexpr auto getLoop() const -> poly::Loop * { - return (addr.addr) ? addr.addr->getLoop() : nullptr; + return (addr.addr) ? 
addr.addr->getAffineLoop() : nullptr; } [[nodiscard]] constexpr auto getMaxDepth() const -> unsigned { return maxDepth - rejectDepth; @@ -176,7 +176,7 @@ class Cache { map llvmToInternalMap; map instCSEMap; map constMap; - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; llvm::LoopInfo *LI; llvm::ScalarEvolution *SE; Compute *freeInstList{nullptr}; // positive numOps/complete, but empty @@ -196,7 +196,7 @@ class Cache { auto getCSE(Compute *I) -> Compute *& { return instCSEMap[InstByValue{I}]; } // NOLINTNEXTLINE(misc-no-recursion) auto createValue(llvm::Value *v, Predicate::Map *M, TreeResult tr, Value *&n) - -> std::pair { + -> containers::Pair { if (auto *i = llvm::dyn_cast(v)) return createInstruction(i, M, tr, n); if (auto *c = llvm::dyn_cast(v)) @@ -205,6 +205,12 @@ class Cache { return {createConstant(c, n), tr}; return {createConstantVal(v, n), tr}; } + /// void replaceUsesByUsers(Value *oldNode, Value *newNode) + /// The name is confusing. This iterates through oldNode's users + /// (i.e. things using oldNode), and swaps the `oldNode` for `newNode`. + /// It checks if those users are `newNode` itself, if so, it does not modify. + /// This allows replacing `x` with `f(x)`, for example. That feature is used + /// for control flow merging. 
// NOLINTNEXTLINE(misc-no-recursion) constexpr void replaceUsesByUsers(Value *oldNode, Value *newNode) { invariant(oldNode->getKind() == Node::VK_Load || @@ -333,12 +339,12 @@ class Cache { return blackList | blackListAllDependentLoops(S, numPeeled); } static void extendDensePtrMatCols(Arena<> *alloc, - MutDensePtrMatrix &A, math::Row R, - math::Col C) { + MutDensePtrMatrix &A, + math::Row<> R, math::Col<> C) { MutDensePtrMatrix B{matrix(alloc, A.numRow(), C)}; for (ptrdiff_t j = 0; j < R; ++j) { - B(j, _(0, A.numCol())) << A(j, _); - B(j, _(A.numCol(), end)) << 0; + B[j, _(0, A.numCol())] << A[j, _]; + B[j, _(A.numCol(), end)] << 0; } std::swap(A, B); } @@ -359,7 +365,7 @@ class Cache { /// complete the operands // NOLINTNEXTLINE(misc-no-recursion) auto complete(Compute *I, Predicate::Map *M, TreeResult tr) - -> std::pair { + -> containers::Pair { auto *i = I->getLLVMInstruction(); unsigned nOps = I->numCompleteOps(); auto ops = I->getOperands(); @@ -374,7 +380,7 @@ class Cache { } // update list of incomplets inline auto completeInstructions(Predicate::Map *M, TreeResult tr) - -> std::pair { + -> containers::Pair { Compute *completed = nullptr; for (Compute *I = tr.incomplete; I; I = static_cast(I->getNext())) { @@ -393,7 +399,7 @@ class Cache { /// try to remove `I` as a duplicate /// this travels downstream; /// if `I` is eliminated, all users of `I` - /// get updated, making them CSE-candiates. + /// get updated, making them CSE-candidates. /// In this manner, we travel downstream through users. 
// NOLINTNEXTLINE(misc-no-recursion) auto cse(Compute *I) -> Compute * { @@ -410,7 +416,7 @@ class Cache { /// updating the operands of all users of `oldNode` /// and the `users` of all operands of `oldNode` // NOLINTNEXTLINE(misc-no-recursion) - constexpr void replaceAllUsesWith(Instruction *oldNode, Value *newNode) { + void replaceAllUsesWith(Instruction *oldNode, Value *newNode) { invariant(oldNode->getKind() == Node::VK_Load || oldNode->getKind() >= Node::VK_Func); replaceUsesByUsers(oldNode, newNode); @@ -433,21 +439,21 @@ class Cache { /// `nullptr`, then all operands will be left incomplete. // NOLINTNEXTLINE(misc-no-recursion) auto getValue(llvm::Value *v, Predicate::Map *M, TreeResult tr) - -> std::pair { + -> containers::Pair { Value *&n = llvmToInternalMap[v]; if (n) return {n, tr}; // by reference, so we can update in creation return createValue(v, M, tr, n); } auto getValue(llvm::Instruction *I, Predicate::Map *M, TreeResult tr) - -> std::pair { + -> containers::Pair { auto [v, tret] = getValue(static_cast(I), M, tr); return {llvm::cast(v), tret}; } // NOLINTNEXTLINE(misc-no-recursion) auto createInstruction(llvm::Instruction *I, Predicate::Map *M, TreeResult tr, - Value *&t) -> std::pair { + Value *&t) -> containers::Pair { auto *load = llvm::dyn_cast(I); auto *store = llvm::dyn_cast(I); if (!load && !store) return createCompute(I, M, tr, t); @@ -467,7 +473,7 @@ class Cache { // NOLINTNEXTLINE(misc-no-recursion) auto createCompute(llvm::Instruction *I, Predicate::Map *M, TreeResult tr, - Value *&t) -> std::pair { + Value *&t) -> containers::Pair { auto [id, kind] = Compute::getIDKind(I); int numOps = int(I->getNumOperands()); Compute *n = std::construct_at(allocateInst(numOps), kind, I, id, -numOps); @@ -483,12 +489,12 @@ class Cache { auto zeroDimRef(llvm::Instruction *loadOrStore, llvm::SCEVUnknown const *arrayPtr, unsigned numLoops) -> Addr * { - return Addr::construct(&alloc, arrayPtr, loadOrStore, numLoops); + return Addr::zeroDim(&alloc, 
arrayPtr, loadOrStore, numLoops); } // create Addr auto getArrayRef(llvm::Instruction *loadOrStore, llvm::Loop *L, llvm::Value *ptr, TreeResult tr) - -> std::pair { + -> containers::Pair { Value *&n = llvmToInternalMap[loadOrStore]; if (n) return {n, tr}; auto ret = createArrayRef(loadOrStore, L, ptr, tr); @@ -497,14 +503,14 @@ class Cache { } // create Addr auto createArrayRef(llvm::Instruction *loadOrStore, llvm::Value *ptr, - TreeResult tr) -> std::pair { + TreeResult tr) -> containers::Pair { llvm::Loop *L = LI->getLoopFor(loadOrStore->getParent()); return createArrayRef(loadOrStore, L, ptr, tr); } // create Addr auto createArrayRef(llvm::Instruction *loadOrStore, llvm::Loop *L, llvm::Value *ptr, TreeResult tr) - -> std::pair { + -> containers::Pair { const auto *elSz = SE->getElementSize(loadOrStore); const llvm::SCEV *accessFn = SE->getSCEVAtScope(ptr, L); unsigned numLoops = L->getLoopDepth(); @@ -517,7 +523,7 @@ class Cache { auto createArrayRef(llvm::Instruction *loadOrStore, const llvm::SCEV *accessFn, unsigned numLoops, const llvm::SCEV *elSz, TreeResult tr) - -> std::pair { + -> containers::Pair { // https://llvm.org/doxygen/Delinearization_8cpp_source.html#l00582 const llvm::SCEV *pb = SE->getPointerBase(accessFn); @@ -535,27 +541,28 @@ class Cache { if (numDims == 0) return {zeroDimRef(loadOrStore, arrayPtr, 0), tr}; unsigned numPeeled = tr.rejectDepth; numLoops -= numPeeled; - math::IntMatrix Rt{math::StridedDims{numDims, numLoops}, 0}; + math::IntMatrix> Rt{ + math::StridedDims<>{{numDims}, {numLoops}}, 0}; llvm::SmallVector symbolicOffsets; uint64_t blackList{0}; math::Vector coffsets{unsigned(numDims), 0}; - MutDensePtrMatrix offsMat{nullptr, DenseDims{numDims, 0}}; + MutDensePtrMatrix offsMat{nullptr, DenseDims<>{{numDims}, {0}}}; { math::Vector offsets; for (ptrdiff_t i = 0; i < numDims; ++i) { offsets << 0; blackList |= - fillAffineIndices(Rt(i, _), &coffsets[i], offsets, symbolicOffsets, + fillAffineIndices(Rt[i, _], &coffsets[i], offsets, 
symbolicOffsets, subscripts[i], 1, numPeeled); if (offsets.size() > offsMat.numCol()) - extendDensePtrMatCols(&alloc, offsMat, math::Row{i}, - math::Col{offsets.size()}); - offsMat(i, _) << offsets; + extendDensePtrMatCols(&alloc, offsMat, math::Row<>{i}, + math::Col<>{offsets.size()}); + offsMat[i, _] << offsets; } } size_t numExtraLoopsToPeel = 64 - std::countl_zero(blackList); Addr *op = Addr::construct(&alloc, arrayPtr, loadOrStore, - Rt(_, _(numExtraLoopsToPeel, end)), + Rt[_, _(numExtraLoopsToPeel, end)], {std::move(sizes), std::move(symbolicOffsets)}, coffsets, offsMat.data(), numLoops, tr.maxDepth); tr.addAddr(op); @@ -601,7 +608,7 @@ class Cache { return B; } auto similarCompute(Compute *A, PtrVector ops) -> Compute * { - invariant(A->getNumOperands(), ops.size()); + invariant(ptrdiff_t(A->getNumOperands()), ops.size()); return createCompute(A->getOpId(), A->getKind(), ops, A->getType(), A->getFastMathFlags()); } diff --git a/include/IR/ControlFlowMerging.hpp b/include/IR/ControlFlowMerging.hpp index a0f8e73de..a228756a0 100644 --- a/include/IR/ControlFlowMerging.hpp +++ b/include/IR/ControlFlowMerging.hpp @@ -1,12 +1,13 @@ #pragma once +#include "Alloc/Arena.hpp" #include "Dicts/BumpMapSet.hpp" #include "IR/BBPredPath.hpp" #include "IR/Cache.hpp" #include "IR/Instruction.hpp" #include "IR/Predicate.hpp" -#include "Utilities/Allocators.hpp" #include +#include #include #include #include @@ -57,7 +58,7 @@ struct MergingCost { // that is, if we're fusing c and d, we can make each point toward // what the other one was pointing to, in order to link the chains. 
amap mergeMap; - math::BumpPtrVector> mergeList; + math::BumpPtrVector> mergeList; amap *> ancestorMap; llvm::InstructionCost cost; @@ -146,8 +147,8 @@ struct MergingCost { H = mergeMap[H]; } } - static constexpr auto popBit(uint8_t x) -> std::pair { - return {x & 1, x >> 1}; + static constexpr auto popBit(uint8_t x) -> containers::Pair { + return {bool(x & 1), uint8_t(x >> 1)}; } struct Allocate { @@ -215,7 +216,7 @@ struct MergingCost { // select(p, f(a,b), f(c,d)) => f(select(p, a, c), select(p, b, d)) // but we can often do better, e.g. we may have // select(p, f(a,b), f(c,b)) => f(select(p, a, c), b) - // additionally, we can check `I->associativeOperandsFlag()` + // additionally, we can check `I->commutativeOperandsFlag()` // select(p, f(a,b), f(c,a)) => f(a, select(p, b, c)) // we need to figure out which operands we're merging with which, // @@ -223,26 +224,26 @@ struct MergingCost { // arguments are merged, as this may be common when two // control flow branches have relatively similar pieces. // E.g., if b and c are already merged, - // and if `f`'s ops are associative, then we'd get + // and if `f`'s ops are commutative, then we'd get // select(p, f(a,b), f(c,a)) => f(a, b) // so we need to check if any operand pairs are merged with each other. // note `isMerged(a,a) == true`, so that's the one query we need to use. auto selector = init(selects, A, B); MutPtrVector operandsA = A->getOperands(); MutPtrVector operandsB = B->getOperands(); - size_t numOperands = operandsA.size(); + ptrdiff_t numOperands = operandsA.size(); assert(numOperands == operandsB.size()); /// associate ops means `f(a, b) == f(b, a)` - uint8_t associativeOpsFlag = B->associativeOperandsFlag(); + uint8_t commutativeOpsFlag = B->commutativeOperandsFlag(); // For example, // we keep track of which operands we've already merged, // f(a, b), f(b, b) // we can't merge b twice! 
- for (size_t i = 0; i < numOperands; ++i) { + for (ptrdiff_t i = 0; i < numOperands; ++i) { auto *opA = A->getOperand(i); auto *opB = B->getOperand(i); - auto [assoc, assocFlag] = popBit(associativeOpsFlag); - associativeOpsFlag = assocFlag; + auto [assoc, assocFlag] = popBit(commutativeOpsFlag); + commutativeOpsFlag = assocFlag; if (opA == opB) continue; // if both operands were merged, we can ignore it's associativity if (isMerged(opB, opA)) { @@ -251,7 +252,7 @@ struct MergingCost { continue; } if (!((assoc) && (assocFlag))) { - // this op isn't associative with any remaining + // this op isn't commutative with any remaining selector.select(i, opA, opB); continue; } @@ -284,7 +285,7 @@ struct MergingCost { return unsigned(selector); } - void merge(Arena<> *alloc, llvm::TargetTransformInfo &TTI, + void merge(Arena<> *alloc, const llvm::TargetTransformInfo &TTI, unsigned int vectorBits, Instruction *A, Instruction *B) { mergeList.emplace_back(A, B); auto *aA = ancestorMap.find(B); @@ -369,8 +370,8 @@ struct MergingCost { // NOLINTNEXTLINE(misc-no-recursion) inline void mergeInstructions( Arena<> *alloc, IR::Cache &cache, Predicate::Map &predMap, - llvm::TargetTransformInfo &TTI, unsigned int vectorBits, - amap> + const llvm::TargetTransformInfo &TTI, unsigned int vectorBits, + amap> opMap, amap &valToPred, llvm::SmallVectorImpl &mergingCosts, Instruction *J, @@ -428,7 +429,7 @@ inline void mergeInstructions( } // descendants aren't legal merge candidates, so push after merging if (vec.getCapacity() <= vec.size()) - vec.reserve(alloc, std::max(unsigned(8), 2 * vec.size())); + vec.reserve(alloc, std::max(ptrdiff_t(8), 2 * vec.size())); vec.push_back(J); valToPred[J] = preds; // TODO: prune bad candidates from mergingCosts @@ -445,20 +446,19 @@ inline void mergeInstructions( /// merging as it allocates a lot of memory that it can free when it is done. /// TODO: this algorithm is exponential in time and memory. /// Odds are that there's way smarter things we can do. 
-[[nodiscard]] inline auto mergeInstructions(IR::Cache &cache, - Predicate::Map &predMap, - llvm::TargetTransformInfo &TTI, - Arena<> tAlloc, unsigned vectorBits, - TreeResult tr) -> TreeResult { +[[nodiscard]] inline auto +mergeInstructions(IR::Cache &cache, Predicate::Map &predMap, + const llvm::TargetTransformInfo &TTI, Arena<> tAlloc, + unsigned vectorBits, TreeResult tr) -> TreeResult { auto [completed, trret] = cache.completeInstructions(&predMap, tr); tr = trret; if (!predMap.isDivergent()) return tr; // there is a divergence in the control flow that we can ideally merge - amap> + amap> opMap{&tAlloc}; amap valToPred{&tAlloc}; llvm::SmallVector mergingCosts; - mergingCosts.emplace_back(tAlloc); + mergingCosts.emplace_back(&tAlloc); // We search through incomplete instructions inside the predMap // this should yield all merge candidates.L for (auto *C = completed; C; C = static_cast(C->getNext())) { diff --git a/include/IR/CostModeling.hpp b/include/IR/CostModeling.hpp deleted file mode 100644 index 7f38643b7..000000000 --- a/include/IR/CostModeling.hpp +++ /dev/null @@ -1,658 +0,0 @@ -#pragma once - -// #include "./ControlFlowMerging.hpp" -#include "Graphs/Graphs.hpp" -#include "IR/Address.hpp" -#include "LinearProgramming/LoopBlock.hpp" -#include "LinearProgramming/ScheduledNode.hpp" -#include "Polyhedra/Dependence.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace poly::CostModeling { - -class CPURegisterFile { - [[no_unique_address]] uint8_t maximumVectorWidth; - [[no_unique_address]] uint8_t numVectorRegisters; - [[no_unique_address]] uint8_t numGeneralPurposeRegisters; - [[no_unique_address]] uint8_t numPredicateRegisters; - - // hacky check for has AVX512 - static inline auto hasAVX512(llvm::LLVMContext &C, - const 
llvm::TargetTransformInfo &TTI) -> bool { - return TTI.isLegalMaskedExpandLoad( - llvm::FixedVectorType::get(llvm::Type::getDoubleTy(C), 8)); - } - - static auto estimateNumPredicateRegisters( - llvm::LLVMContext &C, const llvm::TargetTransformInfo &TTI) -> uint8_t { - if (TTI.supportsScalableVectors()) return 8; - // hacky check for AVX512 - if (hasAVX512(C, TTI)) return 7; // 7, because k0 is reserved for unmasked - return 0; - } - // returns vector width in bits, ignoring mprefer-vector-width - static auto estimateMaximumVectorWidth(llvm::LLVMContext &C, - const llvm::TargetTransformInfo &TTI) - -> uint8_t { - uint8_t twiceMaxVectorWidth = 2; - auto *f32 = llvm::Type::getFloatTy(C); - llvm::InstructionCost prevCost = TTI.getArithmeticInstrCost( - llvm::Instruction::FAdd, - llvm::FixedVectorType::get(f32, twiceMaxVectorWidth)); - while (true) { - llvm::InstructionCost nextCost = TTI.getArithmeticInstrCost( - llvm::Instruction::FAdd, - llvm::FixedVectorType::get(f32, twiceMaxVectorWidth *= 2)); - if (nextCost > prevCost) break; - prevCost = nextCost; - } - return 16 * twiceMaxVectorWidth; - } - -public: - CPURegisterFile(llvm::LLVMContext &C, const llvm::TargetTransformInfo &TTI) { - maximumVectorWidth = estimateMaximumVectorWidth(C, TTI); - numVectorRegisters = TTI.getNumberOfRegisters(true); - numGeneralPurposeRegisters = TTI.getNumberOfRegisters(false); - numPredicateRegisters = estimateNumPredicateRegisters(C, TTI); - } - [[nodiscard]] constexpr auto getNumVectorBits() const -> uint8_t { - return maximumVectorWidth; - } - [[nodiscard]] constexpr auto getNumVector() const -> uint8_t { - return numVectorRegisters; - } - [[nodiscard]] constexpr auto getNumScalar() const -> uint8_t { - return numGeneralPurposeRegisters; - } - [[nodiscard]] constexpr auto getNumPredicate() const -> uint8_t { - return numPredicateRegisters; - } -}; -// struct CPUExecutionModel {}; - -// Plan for cost modeling: -// 1. Build Instruction graph -// 2. 
Iterate over all PredicatedChains, merging instructions across branches -// where possible -// 3. Create a loop tree structure for optimization -// 4. Create InstructionBlocks at each level. - -// void pushBlock(llvm::SmallPtrSet &trackInstr, -// llvm::SmallPtrSet &chainBBs, -// Predicates &pred, llvm::BasicBlock *BB) { -// assert(chainBBs.contains(block)); -// chainBBs.erase(BB); -// // we only want to extract relevant instructions, i.e. parents of -// stores for (llvm::Instruction &instr : *BB) { -// if (trackInstr.contains(&instr)) -// instructions.emplace_back(pred, instr); -// } -// llvm::Instruction *term = BB->getTerminator(); -// if (!term) -// return; -// switch (term->getNumSuccessors()) { -// case 0: -// return; -// case 1: -// BB = term->getSuccessor(0); -// if (chainBBs.contains(BB)) -// pushBlock(trackInstr, chainBBs, pred, BB); -// return; -// case 2: -// break; -// default: -// assert(false); -// } -// auto succ0 = term->getSuccessor(0); -// auto succ1 = term->getSuccessor(1); -// if (chainBBs.contains(succ0) && chainBBs.contains(succ1)) { -// // TODO: we need to fuse these blocks. - -// } else if (chainBBs.contains(succ0)) { -// pushBlock(trackInstr, chainBBs, pred, succ0); -// } else if (chainBBs.contains(succ1)) { -// pushBlock(trackInstr, chainBBs, pred, succ1); -// } -// } -template using Vec = math::ResizeableView; - -// TODO: instead of this, update in-place and ensure all Addr are over-allocated -// to correspond with max depth? -// Because we parse in reverse order, we have max possible depth of -// `ScheduledNode`s using it at time we create. - -/// LoopTree -/// A tree of loops, with an indexable vector of IR::Loop*s, to facilitate -/// construction of the IR::Loop graph, from the fusion omegas -class LoopTree { - // The root of this subtree - NotNull loop; - LoopTree *parent{nullptr}; // do we need this? 
- Vec children{}; - unsigned depth{0}; - // We do not need to know the previous loop, as dependencies between - // the `Addr`s and instructions will determine the ordering. - constexpr LoopTree(Arena<> *lalloc, LoopTree *parent_) - : loop{lalloc->create(parent_->depth + 1)}, parent(parent_), - depth(parent_->depth + 1) { - // allocate the root node, and connect it to parent's node, as well as - // previous loop of the same level. - loop->setParent(parent_->loop); - } - constexpr LoopTree(Arena<> *lalloc) : loop{lalloc->create(0)} {} - -public: - static auto root(Arena<> *salloc, Arena<> *lalloc) -> LoopTree * { - return new (salloc) LoopTree(lalloc); - } - // salloc: Short lived allocator, for the indexable `Vec`s - // Longer lived allocator, for the IR::Loop nodes - // NOLINTNEXTLINE(misc-no-recursion) - void addNode(Arena<> *salloc, Arena<> *lalloc, lp::ScheduledNode *node) { - if (node->getNumLoops() == depth) { - // Then it belongs here, and we add loop's dependencies. - // We only need to add deps to support SCC/top sort now. - // We also apply the rotation here. - // For dependencies in SCC iteration, only indvar deps get iterated. 
- auto [Pinv, denom] = math::NormalForm::scaledInv(node->getPhi()); - NotNull affloop = - node->getLoopNest()->rotate(lalloc, Pinv, node->getOffset()); - for (IR::Addr *m : node->localAddr()) { - m->rotate(affloop, Pinv, denom, node->getOffsetOmega(), - node->getOffset()); - loop->insertAfter(m); - } - return; - } - // we need to find the sub-loop tree to which we add `node` - ptrdiff_t idx = node->getFusionOmega(depth); - invariant(idx >= 0); - ptrdiff_t numChildren = children.size(); - if (idx >= children.size()) { - if (idx >= children.getCapacity()) { - // allocate extra capacity - children.reserve(salloc, 2 * (idx + 1)); - } - // allocate new nodes and resize - children.resize(idx + 1); - for (ptrdiff_t i = numChildren; i < idx + 1; ++i) - children[i] = new (salloc) LoopTree{lalloc, this}; - numChildren = idx + 1; - } - children[idx]->addNode(salloc, lalloc, node); - } - constexpr auto getChildren() -> Vec { return children; } - constexpr auto getLoop() -> IR::Loop * { return loop; } -}; - -struct LoopDepSummary { - IR::Node *afterExit{nullptr}; - IR::Addr *indexedByLoop{nullptr}; - IR::Addr *notIndexedByLoop{nullptr}; -}; -struct LoopIndependent { - LoopDepSummary summary; - bool independent; - constexpr auto operator*=(LoopIndependent other) -> LoopIndependent & { - summary = other.summary; - independent = independent && other.independent; - return *this; - } -}; -// - -// searches `N` and it's users for loop-independent users -// this exits early if it finds a dependent user; we search everything -// anyway, so we'll revist later anyway. -// We return a `IR::Node *, bool` pair, where the `bool` is true if -// `N` was loop independent. -// We do this rather than something like returning a `nullptr`, as -// we may have descended into instructions, found some users that are -// but then also found some that are not; we need to return `false` -// in this case, but we of course want to still return those we found. 
-// NOLINTNEXTLINE(misc-no-recursion) -inline auto searchLoopIndependentUsers(IR::Dependencies deps, IR::Loop *L, - IR::Node *N, uint8_t depth, - LoopDepSummary summary) - -> LoopIndependent { - if (N->dependsOnParentLoop()) return {summary, false}; - if (llvm::isa(N)) return {summary, false}; - if (IR::Loop *P = N->getLoop(); P != L) - return {summary, !(P && L->contains(P))}; - LoopIndependent ret{summary, true}; - auto *a = llvm::dyn_cast(N); - if (a) { - a->removeFromList(); - if (a->indexedByInnermostLoop()) { - a->insertAfter(ret.summary.indexedByLoop); - ret.summary.indexedByLoop = a; - return {summary, false}; - } - a->insertAfter(ret.summary.notIndexedByLoop); - ret.summary.notIndexedByLoop = a; - for (IR::Addr *m : a->outputAddrs(deps, depth)) { - ret *= searchLoopIndependentUsers(deps, L, m, depth, summary); - if (ret.independent) continue; - a->setDependsOnParentLoop(); - return ret; - } - } - // if it isn't a Loop, must be an `Instruction` - IR::Value *I = llvm::cast(N); - for (IR::Node *U : I->getUsers()) { - ret *= searchLoopIndependentUsers(deps, L, U, depth, summary); - if (ret.independent) continue; - I->setDependsOnParentLoop(); - return ret; - } - // then we can push it to the front of the list, meaning it is hoisted out - if (a) { - if (ret.summary.notIndexedByLoop == a) - ret.summary.notIndexedByLoop = llvm::cast_or_null(a->getNext()); - } - I->removeFromList(); - I->insertAfter(ret.summary.afterExit); - ret.summary.afterExit = I; - I->visit(depth); - return ret; -} -// NOLINTNEXTLINE(misc-no-recursion) -inline auto visitLoopDependent(IR::Dependencies deps, IR::Loop *L, IR::Node *N, - uint8_t depth, IR::Node *body) -> IR::Node * { - invariant(N->getVisitDepth() != 254); - // N may have been visited as a dependent of an inner loop, which is why - // `visited` accepts a depth argument - if (N->wasVisited(depth) || !(L->contains(N))) return body; -#ifndef NDEBUG - // Our goal here is to check for cycles in debug mode. 
- // Each level of our graph is acyclic, meaning that there are no cycles at - // that level when traversing only edges active at that given level. However, - // when considering edges active at level `I`, we may have cycles at level `J` - // if `J>I`. In otherwords, here we are travering all edges active at - // `I=depth`. Within subloops, which necessarilly have depth `J>I`, we may - // have cycles. - // - // Thus, we need to prevent getting stuck in a cycle for these deeper loops by - // setting `N->visit(depth)` here, so `wasVisited` will allow them to - // immediately return. But, in debug mode, we'll set nodes of the same depth - // to `254` to check for cycles. - if (N->getLoop() == L) N->visit(254); - else N->visit(depth); -#else - N->visit(depth); -#endif - // iterate over users - if (auto *A = llvm::dyn_cast(N)) { - for (IR::Addr *m : A->outputAddrs(deps, depth)) { - if (m->wasVisited(depth)) continue; - body = visitLoopDependent(deps, L, m, depth, body); - } - } - if (auto *I = llvm::dyn_cast(N)) { - for (IR::Node *U : I->getUsers()) { - if (U->wasVisited(depth)) continue; - body = visitLoopDependent(deps, L, U, depth, body); - } - } else if (auto *S = llvm::dyn_cast(N)) { - for (IR::Node *U : S->getChild()->nodes()) { - if (U->wasVisited(depth)) continue; - body = visitLoopDependent(deps, L, U, depth, body); - } - } -#ifndef NDEBUG - if (N->getLoop() == L) N->visit(depth); -#endif - if (N->getLoop() == L) body = N->setNext(body); - return body; -} -inline auto topologicalSort(IR::Dependencies deps, IR::Loop *root, - unsigned depth) -> IR::Node * { - // basic plan for the top sort: - // We iterate across all users, once all of node's users have been added, - // we push it to the front of the list. Thus, we get a top-sorted list. - // We're careful about the order, so that this top sort should LICM all the - // addresses that it can. - // - // We must push the exit before the root (as the exit depends on the loop, and - // we iterate users). 
- // The exit doesn't use any in this block, so we begin by trying to push any - // instructions that don't depend on the loop. If we fail to push them (i.e., - // because they have uses that do depend on the loop), then they get added to - // a revisit queue. Any instructions we are able to push-front before we push - // the exit, implicitly happen after the exit, i.e. they have been LICMed into - // the exit block. We unvisit the revisit-queue, and add them back to the main - // worklist. Then, we proceed with a depth-first topological sort normally - // (iterating over uses, pushing to the front), starting with the loop root, - // so that it gets pushed to the front as soon as possible. That is, so that - // it happens as late as possible Any instructions that get pushed to the - // front afterwards have been LICMed into the loop pre-header. - // - // In this first pass, we iterate over all nodes, pushing those - // that can be hoisted after the exit block. - IR::Node *C = root->getChild(); - LoopDepSummary summary; - for (IR::Node *N : C->nodes()) - summary = searchLoopIndependentUsers(deps, root, N, depth, summary).summary; - // summary.afterExit will be hoisted out; every member has been marked as - // `visited` So, now we search all of root's users, i.e. 
every addr that - // depends on it - IR::Node *body; - for (IR::Node *N : summary.indexedByLoop->nodes()) - body = visitLoopDependent(deps, root, N, depth, body); - body = root->setNext(body); // now we can place the loop - for (IR::Node *N : summary.notIndexedByLoop->nodes()) - body = visitLoopDependent(deps, root, N, depth, body); - // and any remaining edges - return body; -} -// NOLINTNEXTLINE(misc-no-recursion) -inline auto buildGraph(IR::Dependencies deps, IR::Loop *root, unsigned depth) - -> IR::Node * { - // We build the instruction graph, via traversing the tree, and then - // top sorting as we recurse out - for (IR::Loop *child : root->subLoops()) buildGraph(deps, child, depth + 1); - return topologicalSort(deps, root, depth); -} - -inline auto addAddrToGraph(Arena<> *salloc, Arena<> *lalloc, - lp::ScheduledNode *nodes) -> IR::Loop * { - auto s = salloc->scope(); - LoopTree *root = LoopTree::root(salloc, lalloc); - for (lp::ScheduledNode *node : nodes->getAllVertices()) - root->addNode(salloc, lalloc, node); - return root->getLoop(); -} -inline void eliminateAddr(IR::Addr *a, IR::Addr *b) { - if (a->indexMatrix() != b->indexMatrix()) return; - /// are there any addr between them? - if (a->isStore()) { - if (b->isStore()) { // Write->Write - // Are there reads in between? If so, we must keep-- - // --unless we're storing the same value twice (???) - // without other intervening store-edges. - // Without reads in between, it's safe. - } else { // Write->Read - // Can we replace the read with using the written value? - if (a->getLoop() != b->getLoop()) return; - } - } else if (b->isLoad()) { // Read->Read - // If they don't have the same parent, either... - // They're in different branches of loops, and load can't live - // in between them - // for (i : I){ - // for (j : J){ - // A[i,j]; - // } - // for (j : J){ - // A[i,j]; - // } - // } - // or it is a subloop, but dependencies prevented us from hoisting. 
- if (a->getLoop() != b->getLoop()) return; - // Any writes in between them? - } // Read->Write, can't delete either -} -// plan: SCC? Iterate over nodes in program order? -// then we can iterate in order. -// What to do about depth? -// We may have -// for (i : I){ -// for (j : J){ -// A[j] = x; // store -// y = A[j]; // load -// } -// } -// In this case, we do have a cycle: -// A[j]^s_i -> A[j]^l_i -// A[j]^l_i -> A[j]^s_{i+1} -// However, this cycle does not prohibit deleting the load, -// replacing it with `y = x`. -// This still holds true if the load were a second store: -// for (i : I){ -// for (j : J){ -// A[j] = x; // store -// A[j] = y; // load -// } -// } -// We could stick with the single `y` store. -// Thus, for eliminating memory operations at a depth of 2, -// we are only concerned with dependencies still valid at a depth of 2. -// for (int i = 0 : i < I; ++i){ -// x[i] /= U[i,i]; -// for (int j = i+1; j < I; ++j){ -// x[j] -= x[i]*U[i,j]; -// } -// } -// Maybe just do the dumb thing? -// Walk the graph for addr costs, and at the same time, -// check the addr for eliminability, checking against what we've stored thus -// far. -// We currently do not store load-load edges, which is why only checking -// edge relationships is not ideal. -// We may store load-load edges in the future, as these could be used as -// part of the cost function of the linear program, i.e. we'd want to -// minimize the distance between loads (but allow reordering them). -// -// I think a reasonable approach is: -// Have a map from array pointer to Addr. Addrs form a chain. -// as we walk the graph, add each newly encountered addr to the front of the -// chain and check if we can eliminate it, or any of its predecessors. 
-// -// Note (bracketed means we might be able to eliminate): -// Read->[Read] could eliminate read -// Read->Write no change -// Write->[Read] can forward written value -// [Write]->Write can eliminate first write -// Thus, we can fuse this pass with our address cost calculation. -// We check if we can eliminate before calculating the new cost. -// The only case where we may remove an old value, write->write, -// we could just take the old cost and assign it to the new write. -// TODO: if we have only writes to a non-escaping array, we should -// be able to eliminate these writes too, and then also potentially -// remove that array temporary (e.g., if it were malloc'd). -// E.g. check if the array is a `llvm::isNonEscapingLocalObject` and allocated -// by `llvm::isRemovableAlloc`. -inline void removeRedundantAddr(IR::Dependencies deps, IR::Addr *addr) { - for (IR::Addr *a : addr->eachAddr()) { - for (poly::Dependence *d = a->getEdgeOut(); d; d = d->getNextOutput()) { - IR::Addr *b = d->output(); - eliminateAddr(a, b); - } - } -} -// -// Considering reordering legality, example -// for (int i = 0: i < I; ++i){ -// for (int j = 0 : j < i; ++j){ -// x[i] -= x[j]*U[j,i]; -// } -// x[i] /= U[i,i]; -// } -// We have an edge from the store `x[i] = x[i] / U[i,i]=` to the load of -// `x[j]`, when `j = ` the current `i`, on some future iteration. 
-// We want to unroll; -// for (int i = 0: i < I-3; i += 4){ -// for (int j = 0 : j < i; ++j){ -// x[i] -= x[j]*U[j,i]; -// x[i+1] -= x[j]*U[j,i+1]; -// x[i+2] -= x[j]*U[j,i+2]; -// x[i+3] -= x[j]*U[j,i+3]; -// } -// x[i] /= U[i,i]; // store 0 -// { // perform unrolled j = i iter -// int j = i; // these all depend on store 0 -// x[i+1] -= x[j]*U[j,i+1]; -// x[i+2] -= x[j]*U[j,i+2]; -// x[i+3] -= x[j]*U[j,i+3]; -// } -// x[i+1] /= U[i+1,i+1]; // store 1 -// { // perform unrolled j = i + 1 iter -// int j = i+1; // these all depend on store 1 -// x[i+2] -= x[j]*U[j,i+2]; -// x[i+3] -= x[j]*U[j,i+3]; -// } -// x[i+2] /= U[i+2,i+2]; // store 2 -// { // perform unrolled j = i + 2 iter -// int j = i+2; // this depends on store 2 -// x[i+3] -= x[j]*U[j,i+3]; -// } -// x[i+3] /= U[i+3,i+3]; -// } -// The key to legality here is that we peel off the dependence polyhedra -// from the loop's iteration space. -// We can then perform the dependent iterations in order. -// With masking, the above code can be vectorized in this manner. -// The basic approach is that we have the dependence polyhedra: -// -// 0 <= i_s < I -// 0 <= i_l < I -// 0 <= j_l < i_l -// i_s = j_l // dependence, yields same address in `x` -// -// Note that our schedule sets -// i_s = i_l -// Which gives: -// i_l = i_s = j_l < i_l -// a contradiction, meaning that the dependency is -// conditionally (on our schedule) independent. -// Excluding the `i_s = i_l` constraint from the -// polyhedra gives us the region of overlap. 
-// -// When unrolling by `U`, we get using `U=4` as an example: -// i^0_s + 1 = i^1_s -// i^0_s + 2 = i^2_s -// i^0_s + 3 = i^3_s -// 0 <= i^0_s < I -// 0 <= i^1_s < I -// 0 <= i^2_s < I -// 0 <= i^3_s < I -// 0 <= i^0_l < I -// 0 <= i^1_l < I -// 0 <= i^2_l < I -// 0 <= i^3_l < I -// 0 <= j_l < i^0_l -// 0 <= j_l < i^1_l -// 0 <= j_l < i^2_l -// 0 <= j_l < i^3_l -// i^0_s = j_l || i^1_s = j_l || i^2_s = j_l || i^3_s = j_l -// where the final union can be replaced with -// i^0_s = j_l || i^0_s+1 = j_l || i^0_s+2 = j_l || i^0_s+3 = j_l -// i^0_s <= j_1 <= i^0_s+3 -// -// Similarly, we can compress the other inequalities... -// 0 <= i^0_s < I - 3 -// 0 <= i^0_l < I - 3 -// 0 <= j_l < i^0_l -// i^0_s <= j_1 <= i^0_s+3 // dependence region -// -// So, the parallel region is the union -// i^0_s > j_1 || j_1 > i^0_s+3 -// -// In this example, note that the region `j_1 > i^0_s+3` is empty -// so we have one parallel region, and then one serial region. -// -/// -/// Optimize the schedule -inline void optimize(IR::Cache &instr, Arena<> *lalloc, - lp::LoopBlock::OptimizationResult res) { - /// we must build the IR::Loop - /// Initially, to help, we use a nested vector, so that we can index into it - /// using the fusion omegas. We allocate it with the longer lived `instr` - /// alloc, so we can checkpoint it here, and use alloc for other IR nodes. 
- Arena<> *salloc = instr.getAllocator(); - - IR::Node *N = buildGraph(addAddrToGraph(salloc, lalloc, res.nodes), 0); - // `N` is the head of the topologically sorted graph - // We now try to remove redundant memory operations - - removeRedundantAddr(res.addr.addr); -} - -/* -// NOLINTNEXTLINE(misc-no-recursion) -inline auto printSubDotFile(Arena<> *alloc, llvm::raw_ostream &out, - map &names, - llvm::SmallVectorImpl &addrNames, - unsigned addrIndOffset, poly::Loop *lret) --> poly::Loop * { -poly::Loop *loop{nullptr}; -size_t j = 0; -for (auto *addr : header.getAddr()) loop = addr->getAffLoop(); -for (auto &subTree : subTrees) { - // `names` might realloc, relocating `names[this]` - if (getDepth()) - names[subTree.subTree] = names[this] + "SubLoop#" + std::to_string(j++); - else names[subTree.subTree] = "LoopNest#" + std::to_string(j++); - if (loop == nullptr) - for (auto *addr : subTree.exit.getAddr()) loop = addr->getAffLoop(); - loop = subTree.subTree->printSubDotFile(alloc, out, names, addrNames, - addrIndOffset, loop); -} -const std::string &name = names[this]; -out << "\"" << name - << "\" [shape=plain\nlabel = <\n"; -size_t i = header.printDotNodes(out, 0, addrNames, addrIndOffset, name); -j = 0; -std::string loopEdges; -for (auto &subTree : subTrees) { - std::string label = "f" + std::to_string(++i); - out << " \n"; - loopEdges += "\"" + name + "\":f" + std::to_string(i) + " -> \"" + - names[subTree.subTree] + "\":f0 [color=\"#ff0000\"];\n"; - i = subTree.exit.printDotNodes(out, i, addrNames, addrIndOffset, name); -} -out << "
"; -// assert(depth == 0 || (loop != nullptr)); -if (loop && (getDepth() > 0)) { - for (size_t i = loop->getNumLoops(), k = getDepth(); i > k;) - loop = loop->removeLoop(alloc, --i); - loop->pruneBounds(alloc); - loop->printBounds(out); -} else out << "Top Level"; -out << "
SubLoop#" << j++ - << "
>];\n" << loopEdges; -if (lret) return lret; -if ((loop == nullptr) || (getDepth() <= 1)) return nullptr; -return loop->removeLoop(alloc, getDepth() - 1); -} - -inline void printDotFile(Arena<> *alloc, llvm::raw_ostream &out) { -map names; -llvm::SmallVector addrNames(numAddr_); -names[this] = "toplevel"; -out << "digraph LoopNest {\n"; -auto p = alloc.scope(); -printSubDotFile(alloc, out, names, addrNames, subTrees.size(), nullptr); -printDotEdges(out, addrNames); -out << "}\n"; -} -*/ -// class LoopForestSchedule : LoopTreeSchedule { -// [[no_unique_address]] Arena<> *allocator; -// }; -} // namespace poly::CostModeling diff --git a/include/IR/Hash.hpp b/include/IR/Hash.hpp index 7e384075e..b5039fccf 100644 --- a/include/IR/Hash.hpp +++ b/include/IR/Hash.hpp @@ -1,6 +1,6 @@ #pragma once -#include "IR/Instruction.hpp" #include "IR/Node.hpp" +#include #include #include @@ -31,15 +31,17 @@ template <> struct ankerl::unordered_dense::hash { case poly::IR::Node::VK_Bint: return combineHash(seed, llvm::hash_value(*x.payload.ci)); default: - poly::invariant(x.kind == poly::IR::Node::VK_Bint); + poly::utils::invariant(x.kind == poly::IR::Node::VK_Bint); return combineHash(seed, llvm::hash_value(*x.payload.cf)); } } }; -template <> struct ankerl::unordered_dense::hash { +template <> +struct ankerl::unordered_dense::hash { using is_avalanching = void; - [[nodiscard]] auto operator()(poly::IR::Identifier const &x) const noexcept + [[nodiscard]] auto + operator()(poly::IR::Instruction::Identifier const &x) const noexcept -> uint64_t { using poly::Hash::combineHash, poly::Hash::getHash; uint64_t seed = getHash(x.kind); @@ -52,27 +54,6 @@ template <> struct ankerl::unordered_dense::hash { /// template <> struct ankerl::unordered_dense::hash { using is_avalanching = void; - [[nodiscard]] auto operator()(poly::IR::InstByValue const &x) const noexcept - -> uint64_t { - using poly::Hash::combineHash, poly::Hash::getHash, poly::containers::UList, - poly::IR::Value; - uint64_t seed 
= getHash(x.inst->getKind()); - seed = combineHash(seed, getHash(x.inst->getType())); - seed = combineHash(seed, getHash(x.inst->getOpId())); - if (x.inst->isIncomplete()) - return combineHash(seed, getHash(x.inst->getLLVMInstruction())); - uint8_t assocFlag = x.inst->associativeOperandsFlag(); - // combine all operands - size_t offset = 0; - poly::PtrVector operands = x.inst->getOperands(); - if (assocFlag) { - poly::invariant(assocFlag, uint8_t(3)); - // we combine hashes in a commutative way - seed = combineHash(seed, getHash(operands[0]) + getHash(operands[1])); - offset = 2; - } - for (auto B = operands.begin() + offset, E = operands.end(); B != E; ++B) - seed = combineHash(seed, getHash(*B)); - return seed; - } + [[nodiscard]] inline auto + operator()(poly::IR::InstByValue const &x) const noexcept -> uint64_t; }; diff --git a/include/IR/Instruction.hpp b/include/IR/Instruction.hpp index d23c78e21..cdf991a49 100644 --- a/include/IR/Instruction.hpp +++ b/include/IR/Instruction.hpp @@ -5,14 +5,12 @@ #include "IR/InstructionCost.hpp" #include "IR/Node.hpp" #include "IR/Predicate.hpp" +#include #include #include -#include #include -#include #include #include -#include #include #include #include @@ -29,11 +27,10 @@ #include #include #include -#include namespace poly { -using math::PtrVector, math::MutPtrVector, utils::Arena, utils::invariant, - utils::NotNull; +using math::PtrVector, math::MutPtrVector, alloc::Arena, utils::invariant, + utils::Valid; }; // namespace poly namespace poly::IR { @@ -75,9 +72,10 @@ class Compute : public Instruction { llvm::Instruction *inst{nullptr}; llvm::Type *type; llvm::Intrinsic::ID opId; // unsigned - int numOperands; // negative means incomplete llvm::FastMathFlags fastMathFlags; // holds unsigned VectorizationCosts costs; + uint32_t loopIndepFlag; + int numOperands; // negative means incomplete #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpedantic" @@ -92,10 +90,22 
@@ class Compute : public Instruction { #pragma clang diagnostic pop #endif + static constexpr auto diffMask(ptrdiff_t smaller, ptrdiff_t larger) + -> uint32_t { + invariant(smaller <= larger); + invariant(larger < 32); + // return ((uint32_t(1) << (larger - smaller)) - 1) << smaller; + uint32_t umask = ((uint32_t(1) << larger) - 1), + lmask = ((uint32_t(1) << smaller) - 1); + return umask ^ lmask; + } + static constexpr auto diffMask(Value *v, ptrdiff_t depth) -> uint32_t { + return diffMask(v->getCurrentDepth(), depth); + } + public: Compute(const Compute &) = delete; - constexpr Compute(ValKind k, llvm::Instruction *i, llvm::Intrinsic::ID id, - int numOps) + Compute(ValKind k, llvm::Instruction *i, llvm::Intrinsic::ID id, int numOps) : Instruction(k), inst(i), type(i->getType()), opId(id), numOperands(numOps), fastMathFlags(i->getFastMathFlags()) {} constexpr Compute(ValKind k, llvm::Intrinsic::ID id, int numOps, @@ -121,7 +131,7 @@ class Compute : public Instruction { return inst; } static auto getIDKind(llvm::Instruction *I) - -> std::pair { + -> containers::Pair { if (auto *c = llvm::dyn_cast(I)) { if (auto *J = llvm::dyn_cast(c)) return {J->getIntrinsicID(), VK_Call}; @@ -150,9 +160,24 @@ class Compute : public Instruction { constexpr auto getOperands() -> MutPtrVector { return {operands, numOperands}; } + [[nodiscard]] constexpr auto getLoopIndepFlag() const { + return loopIndepFlag; + } + constexpr auto calcLoopDepFlag(ptrdiff_t depth) -> uint32_t { + this->currentDepth = depth; + loopIndepFlag = (1 << depth) - 1; + for (auto *op : getOperands()) + if (auto *C = llvm::dyn_cast(op)) + loopIndepFlag &= C->getLoopIndepFlag() | diffMask(C, depth); + else if (auto *A = llvm::dyn_cast(op)) + loopIndepFlag &= A->getOrthAxes().indep | diffMask(C, depth); + return loopIndepFlag; + } + /// Get the arguments to this function [[nodiscard]] constexpr auto getOperands() const -> PtrVector { return {const_cast(operands), unsigned(numOperands)}; } + /// Get the `i`th 
argument of this function [[nodiscard]] constexpr auto getOperand(size_t i) const -> Value * { return operands[i]; } @@ -167,6 +192,10 @@ class Compute : public Instruction { [[nodiscard]] auto allowsContract() const -> bool { return fastMathFlags.allowContract(); } + [[nodiscard]] auto reassociableArgs() const -> uint32_t { + if (!fastMathFlags.allowReassoc()) return 0; + return isMulAdd() ? 0x4 : ((0x1 << numOperands) - 1); + } // Incomplete stores the correct number of ops it was allocated with as a // negative number. The primary reason for being able to check // completeness is for `==` checks and hashing. @@ -181,7 +210,12 @@ class Compute : public Instruction { return (getKind() == VK_Call) && ((opId == llvm::Intrinsic::fmuladd) || (opId == llvm::Intrinsic::fma)); } - [[nodiscard]] auto associativeOperandsFlag() const -> uint8_t { + // Bitmask indicating which args are commutative + // E.g. `muladd(a, b, c)` returns `0x3` + // where the bitpattern is 11000000 + // indicating that the first two arguments are commutative. + // That is, `muladd(a, b, c) == muladd(b, a, c)`. + [[nodiscard]] auto commuatativeOperandsFlag() const -> uint8_t { switch (getKind()) { case VK_Call: return (isMulAdd() || isCommutativeCall()) ? 
0x3 : 0; case VK_Oprn: @@ -210,7 +244,7 @@ class Compute : public Instruction { size_t offset = 0; auto opst = getOperands(); auto opso = other.getOperands(); - if (uint8_t flag = associativeOperandsFlag()) { + if (uint8_t flag = commuatativeOperandsFlag()) { invariant(flag, uint8_t(3)); auto *ot0 = opst[0]; auto *oo0 = opso[0]; @@ -225,15 +259,13 @@ class Compute : public Instruction { return true; } - /// fall back in case we need value operand - // [[nodiscard]] auto isValue() const -> bool { return id.isValue(); } - auto getCost(llvm::TargetTransformInfo &TTI, VectorWidth W) + auto getCost(const llvm::TargetTransformInfo &TTI, VectorWidth W) -> RecipThroughputLatency { RecipThroughputLatency c = costs[W]; if (c.notYetComputed()) costs[W] = c = calcCost(TTI, W.getWidth()); return c; } - [[nodiscard]] inline auto calcCost(llvm::TargetTransformInfo &TTI, + [[nodiscard]] inline auto calcCost(const llvm::TargetTransformInfo &TTI, unsigned vectorWidth) -> RecipThroughputLatency; [[nodiscard]] auto getType(unsigned int vectorWidth) const -> llvm::Type * { @@ -260,14 +292,12 @@ class Compute : public Instruction { // which case it is free [[nodiscard]] inline auto allUsersAdditiveContract() const -> bool; -}; // class Inst +}; // class Compute -struct InstByValue { - Compute *inst; - auto operator==(InstByValue const &other) const -> bool { - return *inst == *other.inst; - } -}; +inline auto InstByValue::operator==(InstByValue const &other) const -> bool { + if (inst == other.inst) return true; + return *inst == *other.inst; +} // some opaque function class OpaqueFunc { @@ -284,8 +314,8 @@ class OpaqueFunc { auto getFunction() -> llvm::Function * { return ins->getLLVMInstruction()->getFunction(); } - auto calcCallCost(llvm::TargetTransformInfo &TTI, unsigned int vectorWidth) - -> RecipThroughputLatency { + auto calcCallCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) -> RecipThroughputLatency { llvm::Type *T = ins->getType(vectorWidth); 
llvm::SmallVector argTypes; for (auto *op : getOperands()) argTypes.push_back(op->getType(vectorWidth)); @@ -296,7 +326,7 @@ class OpaqueFunc { TTI.getCallInstrCost(getFunction(), T, argTypes, llvm::TargetTransformInfo::TCK_Latency)}; } - auto calcCallCost(llvm::TargetTransformInfo &TTI, llvm::Function *F, + auto calcCallCost(const llvm::TargetTransformInfo &TTI, llvm::Function *F, unsigned int vectorWidth) -> RecipThroughputLatency { llvm::Type *T = ins->getType(vectorWidth); llvm::SmallVector argTypes; @@ -335,8 +365,7 @@ class Operation { [[nodiscard]] constexpr auto getNumOperands() const -> unsigned { return ins->getNumOperands(); } - [[nodiscard]] constexpr auto isInstruction(llvm::Intrinsic::ID opCode) const - -> bool { + [[nodiscard]] auto isInstruction(llvm::Intrinsic::ID opCode) const -> bool { return getOpCode() == opCode; } static auto isFMul(Node *n) -> bool { @@ -440,8 +469,9 @@ class Operation { [[nodiscard]] auto getType(unsigned w) const -> llvm::Type * { return ins->getType(w); } - auto calcUnaryArithmeticCost(llvm::TargetTransformInfo &TTI, - unsigned int vectorWidth) const + [[nodiscard]] auto + calcUnaryArithmeticCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) const -> RecipThroughputLatency { auto op0info = ins->getOperandInfo(0); llvm::Type *T = getType(vectorWidth); @@ -454,8 +484,9 @@ class Operation { [[nodiscard]] auto getInstruction() const -> llvm::Instruction * { return ins->getLLVMInstruction(); } - auto calcBinaryArithmeticCost(llvm::TargetTransformInfo &TTI, - unsigned int vectorWidth) const + [[nodiscard]] auto + calcBinaryArithmeticCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) const -> RecipThroughputLatency { auto op0info = ins->getOperandInfo(0); auto op1info = ins->getOperandInfo(1); @@ -477,8 +508,8 @@ class Operation { return isFcmp() ? 
llvm::CmpInst::BAD_FCMP_PREDICATE : llvm::CmpInst::BAD_ICMP_PREDICATE; } - auto calcCmpSelectCost(llvm::TargetTransformInfo &TTI, - unsigned int vectorWidth) const + [[nodiscard]] auto calcCmpSelectCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) const -> RecipThroughputLatency { llvm::Type *T = getType(vectorWidth), *cmpT = llvm::CmpInst::makeCmpResultType(T); @@ -493,11 +524,12 @@ class Operation { /// for calculating the cost of a select when merging this instruction with /// another one. - auto selectCost(llvm::TargetTransformInfo &TTI, - unsigned int vectorWidth) const -> llvm::InstructionCost { + [[nodiscard]] auto selectCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) const + -> llvm::InstructionCost { return selectCost(TTI, getType(vectorWidth)); } - static auto selectCost(llvm::TargetTransformInfo &TTI, llvm::Type *T) + static auto selectCost(const llvm::TargetTransformInfo &TTI, llvm::Type *T) -> llvm::InstructionCost { llvm::Type *cmpT = llvm::CmpInst::makeCmpResultType(T); // llvm::CmpInst::Predicate pred = @@ -512,7 +544,8 @@ class Operation { llvm::Instruction::Select, T, cmpT, pred, llvm::TargetTransformInfo::TCK_RecipThroughput); } - auto getCastContext(llvm::TargetTransformInfo & /*TTI*/) const + [[nodiscard]] auto + getCastContext(const llvm::TargetTransformInfo & /*TTI*/) const -> llvm::TargetTransformInfo::CastContextHint { if (ins->operandIsLoad() || ins->userIsStore()) return llvm::TargetTransformInfo::CastContextHint::Normal; @@ -521,8 +554,9 @@ class Operation { // TODO: check for whether mask, interleave, or reversed is likely. 
return llvm::TargetTransformInfo::CastContextHint::None; } - auto calcCastCost(llvm::TargetTransformInfo &TTI, - unsigned int vectorWidth) const -> RecipThroughputLatency { + [[nodiscard]] auto calcCastCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) const + -> RecipThroughputLatency { llvm::Type *srcT = cost::getType(getOperand(0)->getType(), vectorWidth), *dstT = getType(vectorWidth); llvm::TargetTransformInfo::CastContextHint ctx = getCastContext(TTI); @@ -533,8 +567,8 @@ class Operation { TTI.getCastInstrCost(idt, dstT, srcT, ctx, llvm::TargetTransformInfo::TCK_Latency)}; } - auto calculateCostFAddFSub(llvm::TargetTransformInfo &TTI, - unsigned int vectorWidth) const + [[nodiscard]] auto calculateCostFAddFSub(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) const -> RecipThroughputLatency { // TODO: allow not assuming hardware FMA support if ((isFMulOrFNegOfFMul(getOperand(0)) || @@ -543,15 +577,15 @@ class Operation { return {}; return calcBinaryArithmeticCost(TTI, vectorWidth); } - auto calculateFNegCost(llvm::TargetTransformInfo &TTI, - unsigned int vectorWidth) const + [[nodiscard]] auto calculateFNegCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) const -> RecipThroughputLatency { if (isFMul(getOperand(0)) && ins->allUsersAdditiveContract()) return {}; return calcUnaryArithmeticCost(TTI, vectorWidth); } - [[nodiscard]] auto calcCost(llvm::TargetTransformInfo &TTI, + [[nodiscard]] auto calcCost(const llvm::TargetTransformInfo &TTI, unsigned int vectorWidth) const -> RecipThroughputLatency { switch (getOpCode()) { @@ -647,8 +681,8 @@ class Call { [[nodiscard]] auto getNumOperands() const -> size_t { return ins->getNumOperands(); } - auto calcCallCost(llvm::TargetTransformInfo &TTI, unsigned int vectorWidth) - -> RecipThroughputLatency { + auto calcCallCost(const llvm::TargetTransformInfo &TTI, + unsigned int vectorWidth) -> RecipThroughputLatency { llvm::Type *T = ins->getType(vectorWidth); 
llvm::SmallVector argTypes; for (auto *op : ins->getOperands()) @@ -663,7 +697,8 @@ class Call { } }; -inline auto Value::getCost(llvm::TargetTransformInfo &TTI, cost::VectorWidth W) +inline auto Value::getCost(const llvm::TargetTransformInfo &TTI, + cost::VectorWidth W) -> cost::RecipThroughputLatency { if (auto *a = llvm::dyn_cast(this)) return a->getCost(TTI, W); invariant(getKind() >= VK_Func); @@ -702,8 +737,8 @@ inline auto Value::getType() const -> llvm::Type * { inline auto Value::getType(unsigned w) const -> llvm::Type * { return cost::getType(getType(), w); } -[[nodiscard]] inline auto Compute::calcCost(llvm::TargetTransformInfo &TTI, - unsigned vectorWidth) +[[nodiscard]] inline auto +Compute::calcCost(const llvm::TargetTransformInfo &TTI, unsigned vectorWidth) -> RecipThroughputLatency { if (auto op = Operation(this)) return op.calcCost(TTI, vectorWidth); if (auto call = Call(this)) return call.calcCallCost(TTI, vectorWidth); @@ -750,9 +785,9 @@ inline auto Value::getType(unsigned w) const -> llvm::Type * { if (const auto *I = llvm::dyn_cast(this)) return I->getNumOperands(); return getKind() == VK_Stow; } -[[nodiscard]] inline auto Value::associativeOperandsFlag() const -> uint8_t { +[[nodiscard]] inline auto Value::commutativeOperandsFlag() const -> uint8_t { if (const auto *I = llvm::dyn_cast(this)) - return I->associativeOperandsFlag(); + return I->commuatativeOperandsFlag(); return 0; } [[nodiscard]] inline auto Value::getNumScalarBits() const -> unsigned int { @@ -770,7 +805,7 @@ inline auto Value::getType(unsigned w) const -> llvm::Type * { if (auto *I = getInstruction()) return I->getParent(); return nullptr; } -[[nodiscard]] constexpr auto Instruction::getIdentifier() const +[[nodiscard]] inline auto Instruction::getIdentifier() const -> Instruction::Identifier { llvm::Intrinsic::ID id; if (const auto *I = llvm::dyn_cast(this)) id = I->getOpId(); @@ -804,4 +839,89 @@ inline void Instruction::setOperands(Arena<> *alloc, // 
llvm::Intrinsic::IndependentIntrinsics x = llvm::Intrinsic::sqrt; // llvm::Intrinsic::IndependentIntrinsics y = llvm::Intrinsic::sin; +constexpr auto findComp(Addr *src, Compute *dst) -> bool; +// NOLINTNEXTLINE misc-no-recursion +constexpr auto find(Addr *src, Value *op) { + auto *c = llvm::dyn_cast(op); + return c && findComp(src, c); +} + +/// Defined here, because we're using `Compute` +// NOLINTNEXTLINE misc-no-recursion +constexpr auto findComp(Addr *src, Compute *dst) -> bool { + return std::ranges::any_of(dst->getOperands(), [=](Value *op) -> bool { + if (op != src && !find(src, op)) return false; + static_cast(op)->linkReductionDst(dst); + return true; + }); +} +// from dst, search through operands for `src` +// TODO: accumulate latency as we go! +// Maybe store visited, to avoid potentially revisiting? +// NOLINTNEXTLINE misc-no-recursion +constexpr auto findThroughReassociable(Addr *src, Compute *dst) -> unsigned { + invariant(src->isLoad()); + uint32_t reassociable = dst->reassociableArgs(); + // foundflag&1 == found reassociable + // foundflag&2 == found non-reassociable + unsigned foundflag = 0; + for (Value *op : dst->getOperands()) { + auto *c = llvm::dyn_cast(op); + bool found{false}; + if (reassociable & 1) { + if (op == src) { + foundflag |= 1; + found = true; + } else if (c) { + unsigned f = findThroughReassociable(src, c); + if (!f) continue; + foundflag |= f; + found = true; + } + } else if ((op == src) || (c && findComp(src, c))) { + found = true; + foundflag = 0x2; + } + if (found) static_cast(op)->linkReductionDst(dst); + if (foundflag & 2) return 0x2; + reassociable >>= 1; + } + return foundflag; +} + +inline auto Addr::reductionLatency(const llvm::TargetTransformInfo &TTI, + unsigned vectorWidth) + -> llvm::InstructionCost::CostType { + llvm::InstructionCost::CostType latency{0}; + for (Instruction *d = getReductionDst(); d; d = d->getReductionDst()) + if (Compute *c = llvm::dyn_cast(d)) + latency += c->calcCost(TTI, vectorWidth).latency; 
+ return latency; +} + } // namespace poly::IR + +[[nodiscard]] inline auto +ankerl::unordered_dense::hash::operator()( + poly::IR::InstByValue const &x) const noexcept -> uint64_t { + using poly::Hash::combineHash, poly::Hash::getHash, poly::containers::UList, + poly::IR::Value; + uint64_t seed = getHash(x.inst->getKind()); + seed = combineHash(seed, getHash(x.inst->getType())); + seed = combineHash(seed, getHash(x.inst->getOpId())); + if (x.inst->isIncomplete()) + return combineHash(seed, getHash(x.inst->getLLVMInstruction())); + uint8_t assocFlag = x.inst->commuatativeOperandsFlag(); + // combine all operands + size_t offset = 0; + poly::PtrVector operands = x.inst->getOperands(); + if (assocFlag) { + poly::invariant(assocFlag, uint8_t(3)); + // we combine hashes in a commutative way + seed = combineHash(seed, getHash(operands[0]) + getHash(operands[1])); + offset = 2; + } + for (auto B = operands.begin() + offset, E = operands.end(); B != E; ++B) + seed = combineHash(seed, getHash(*B)); + return seed; +} diff --git a/include/IR/InstructionCost.hpp b/include/IR/InstructionCost.hpp index 8198ab7fa..2ce8a9b80 100644 --- a/include/IR/InstructionCost.hpp +++ b/include/IR/InstructionCost.hpp @@ -1,7 +1,8 @@ #pragma once #include #include -#include +#include +#include #include namespace poly::IR::cost { diff --git a/include/IR/Node.hpp b/include/IR/Node.hpp index c824b5f55..2427a9182 100644 --- a/include/IR/Node.hpp +++ b/include/IR/Node.hpp @@ -1,10 +1,12 @@ #pragma once +#include "Alloc/Arena.hpp" #include "Containers/UnrolledList.hpp" #include "IR/InstructionCost.hpp" #include "IR/Users.hpp" +#include "Optimize/Legality.hpp" #include "Polyhedra/Loops.hpp" -#include "Utilities/Allocators.hpp" +#include "Support/Iterators.hpp" #include "Utilities/ListRanges.hpp" #include #include @@ -16,10 +18,12 @@ #include #include #include -#include +namespace poly::poly { +class Dependencies; +} // namespace poly::poly namespace poly::IR { -using utils::NotNull, utils::invariant, 
utils::Arena, containers::UList; +using utils::Valid, utils::invariant, alloc::Arena, containers::UList; class Loop; /// We take an approach similar to LLVM's RTTI /// however, we want to take advantage of FAMs while having a "hieararchy" @@ -76,7 +80,7 @@ class Loop; /// while (C){ /// // do stuff with `C` /// C = C->getNext() -/// C = (C || llvm::isa(C)) ? C : C->getChild(); +/// C = (!C || llvm::isa(C)) ? C : C->getChild(); /// } /// ``` /// IR types: Loop, Block, Addr, Instr, Consts @@ -87,6 +91,7 @@ class Node { VK_Load, VK_Stow, // used for ordered comparisons; all `Addr` types <= Stow VK_Loop, + VK_Exit, VK_CVal, VK_Cint, VK_Bint, @@ -98,7 +103,7 @@ class Node { }; // we have a private pointer so different types can share - // in manner not exacctly congruent with type hiearchy + // in manner not exactly congruent with type hierarchy // in particular, `Inst` and `Load` want `User` lists // while `Stow`s do not. // `Addr` is the common load/store subtype @@ -106,19 +111,20 @@ class Node { // but only load to inherit 'hasUsers' and only store to inherit the operand. // `Inst` would also inherit 'hasUsers', but would want a different operands // type. - // Addr has a FAM, so multiple inheritence isn't an option for `Load`/`Stow`, + // Addr has a FAM, so multiple inheritance isn't an option for `Load`/`Stow`, // and we want a common base that we can query to avoid monomorphization. protected: const ValKind kind; + /// The current position, `0` means top level, 1 inside a single loop uint8_t currentDepth{0}; // current depth + /// For an `Addr`, this is the "natural depth" where it would be + /// placed in a loop without dependencies, i.e., the innermost index + /// `0` means top level, `1` inside a single loop, etc uint8_t naturalDepth{0}; // original, or, for Addr, `indMat.numCol()` uint8_t visitDepth{255}; uint8_t maxDepth; // memory allocated to support up to this depth bool dependsOnParentLoop_{false}; - // 7 bytes; we have 1 left!
- // uint16_t index_; - // uint16_t lowLink_; - // uint16_t bitfield; + uint16_t topologicalIndex{0}; constexpr Node(ValKind kind_) : kind(kind_) {} constexpr Node(ValKind kind_, unsigned depth) @@ -142,6 +148,7 @@ class Node { return visitDepth; } constexpr void clearVisited() { visitDepth = 255; } + /// bool wasVisited(uint8_t d) { return visitDepth == d; } [[nodiscard]] constexpr auto wasVisited(uint8_t d) const -> bool { return visitDepth == d; } @@ -152,7 +159,14 @@ class Node { [[nodiscard]] constexpr auto sameBlock(const Node *other) const -> bool { return other && other->parent == parent && other->child == child; } - + constexpr void setTopIndex(uint16_t idx) { topologicalIndex = idx; } + constexpr auto getTopIndex() const -> uint16_t { return topologicalIndex; } + constexpr auto isAfter(Node *v) const -> bool { + return topologicalIndex > v->getTopIndex(); + } + constexpr auto isBefore(Node *v) const -> bool { + return topologicalIndex < v->getTopIndex(); + } // [[nodiscard]] constexpr auto wasVisited() const -> bool { // return bitfield & 0x1; // } @@ -171,10 +185,10 @@ class Node { // [[nodiscard]] constexpr auto getIndex() const -> unsigned { return index_; // } constexpr void setIndex(unsigned i) { index_ = i; } [[nodiscard]] constexpr auto getKind() const -> ValKind { return kind; } - [[nodiscard]] constexpr auto getCurrentDepth() const -> unsigned { + [[nodiscard]] constexpr auto getCurrentDepth() const -> int { return currentDepth; } - [[nodiscard]] constexpr auto getNaturalDepth() const -> unsigned { + [[nodiscard]] constexpr auto getNaturalDepth() const -> int { return naturalDepth; } @@ -202,7 +216,8 @@ class Node { if (n) n->child = this; return this; } - constexpr void setCurrentDepth(unsigned d) { + constexpr void setCurrentDepth(int d) { + invariant(d >= 0); invariant(d <= std::numeric_limits::max()); currentDepth = d; } @@ -259,6 +274,7 @@ class Node { if (llvm::isa(v)) return VK_Bflt; return VK_CVal; } + /// Iterate through all instructions 
[[nodiscard]] constexpr auto nodes() noexcept -> utils::ListRange { return utils::ListRange{this, utils::GetNext{}}; @@ -274,21 +290,34 @@ static_assert(sizeof(Node) == 4 * sizeof(Node *) + 8); /// Loop /// parent: outer loop /// child: inner (sub) loop +/// last is the last instruction in the body /// exit is the associated exit block class Loop : public Node { poly::Loop *affineLoop{nullptr}; + Node *last{nullptr}; + /// IDs are in topologically sorted order. + CostModeling::Legality legality{}; + int32_t edgeId{-1}; // edge cycle id + // while `child` points to the first contained instruction, + // `last` points to the last contained instruction, + // and can be used for backwards iteration over the graph. public: - constexpr Loop(unsigned d) : Node(VK_Loop, d) {} + /// Get the IDs for the Dependencies carried by this loop + [[nodiscard]] constexpr auto edges(poly::PtrVector edges) const + -> utils::VForwardRange { + return utils::VForwardRange{edges, edgeId}; + } + constexpr Loop(unsigned d) : Node{VK_Loop, d} {} constexpr Loop(unsigned d, poly::Loop *AL) - : Node(VK_Loop, d), affineLoop(AL) {} + : Node{VK_Loop, d}, affineLoop{AL} {} static constexpr auto classof(const Node *v) -> bool { return v->getKind() == VK_Loop; } /// Get the first subloop. [[nodiscard]] constexpr auto getSubLoop() const -> Loop * { Node *C = getChild(); - C = (C || llvm::isa(C)) ? C : C->getChild(); + C = (!C || llvm::isa(C)) ? C : C->getChild(); return static_cast(C); } /// Return the enclosing, parent loop. @@ -304,12 +333,13 @@ class Loop : public Node { } [[nodiscard]] constexpr auto subLoops() const { return utils::ListRange{getSubLoop(), - [](Loop *L) { return L->getNextLoop(); }}; - } - static constexpr auto create(Arena<> *alloc, poly::Loop *AL, size_t depth) - -> Loop * { - return alloc->create(depth, AL); + [](Loop *L) -> Loop * { return L->getNextLoop(); }}; } + /// getLast() + /// Get the last node in the loop. + /// Useful for iterating backwards. 
+ [[nodiscard]] constexpr auto getLast() const -> Node * { return last; } + constexpr void setLast(Node *n) { last = n; } [[nodiscard]] constexpr auto getLLVMLoop() const -> llvm::Loop * { return affineLoop->getLLVMLoop(); } @@ -323,22 +353,47 @@ class Loop : public Node { } // get the outermost subloop of `this` to which `N` belongs [[nodiscard]] constexpr auto getSubloop(IR::Node *N) -> Loop * { - Loop *L = N->getLoop(); + Loop *L = N->getLoop(), *O; if (L == this) return this; - for (; L;) { - Loop *O = L->getOuterLoop(); + for (; L; L = O) { + O = L->getOuterLoop(); if (O == this) return L; - L = O; } return nullptr; } + [[nodiscard]] constexpr auto getEdge() const -> int32_t { return edgeId; } + constexpr void addEdge(math::MutPtrVector deps, int32_t d) { + invariant(d >= 0); + // [ -1, -1, -1, -1, -1 ] // d = 2, edgeId = -1 + // [ 2, -1, -1, -1, -1 ] // d = 0, edgeId = 2 + // [ 2, -1, -1, -1, 0 ] // d = 4, edgeId = 0 + // now edgeId = 4, and we can follow path 4->0->2 + deps[d] = std::exchange(edgeId, d); + } + constexpr auto getLoopAtDepth(uint8_t d) -> Loop * { + Loop *L = this; + for (uint8_t currDepth = this->currentDepth; currDepth > d; --currDepth) + L = L->getOuterLoop(); + return L; + } + constexpr auto getLegality() -> CostModeling::Legality { return legality; } + inline void setLegality(CostModeling::LoopDepSatisfaction &deps); }; + [[nodiscard]] inline constexpr auto Node::getLoop() const noexcept -> Loop * { - if (!parent) return nullptr; - if (parent->kind != VK_Loop) return nullptr; + if (!parent || (parent->kind != VK_Loop)) return nullptr; return static_cast(parent); } +/// This is used for convenience in top sort, but our canonical IR +/// does not actually contain Exit nodes! 
+struct Exit : Node { + Exit() : Node(VK_Exit) {} + static constexpr auto classof(const Node *v) -> bool { + return v->getKind() == VK_Exit; + } +}; + class Instruction; class Value : public Node { @@ -381,30 +436,6 @@ class Value : public Node { users.push_back(alloc, I); } constexpr void removeFromUsers(Instruction *I) { users.remove(I); } - // unionPtr methods - // [[nodiscard]] constexpr auto getUsers() const - // -> const UList * { - // invariant(kind == VK_Load || kind >= VK_Func); - // return unionPtr.users; - // } - // [[nodiscard]] constexpr auto getUsers() -> UList * { - // invariant(kind == VK_Load || kind >= VK_Func); - // return unionPtr.users; - // } - // constexpr void setUsers(UList *users) { - // invariant(kind == VK_Load || kind >= VK_Func); - // unionPtr.users = users; - // } - // constexpr void addUser(Arena<> *alloc, Instruction *n) { - // invariant(kind == VK_Load || kind >= VK_Func); - // if (!unionPtr.users) - // unionPtr.users = alloc->create>(n); - // else unionPtr.users = unionPtr.users->pushUnique(alloc, n); - // } - // constexpr void removeFromUsers(Instruction *n) { - // invariant(kind == VK_Load || kind >= VK_Func); - // unionPtr.users->eraseUnordered(n); - // } /// isStore() is true if the address is a store, false if it is a load /// If the memory access is a store, this can still be a reload @@ -417,7 +448,7 @@ class Value : public Node { [[nodiscard]] inline auto getFastMathFlags() const -> llvm::FastMathFlags; /// these methods are overloaded for specific subtypes - inline auto getCost(llvm::TargetTransformInfo &TTI, cost::VectorWidth W) + inline auto getCost(const llvm::TargetTransformInfo &TTI, cost::VectorWidth W) -> cost::RecipThroughputLatency; [[nodiscard]] inline auto getValue() -> llvm::Value *; [[nodiscard]] inline auto getValue() const -> const llvm::Value *; @@ -430,7 +461,7 @@ class Value : public Node { [[nodiscard]] inline auto getNumOperands() const -> unsigned; [[nodiscard]] inline auto getOperand(unsigned) -> 
Value *; [[nodiscard]] inline auto getOperand(unsigned) const -> const Value *; - [[nodiscard]] inline auto associativeOperandsFlag() const -> uint8_t; + [[nodiscard]] inline auto commutativeOperandsFlag() const -> uint8_t; [[nodiscard]] inline auto getNumScalarBits() const -> unsigned; [[nodiscard]] inline auto getNumScalarBytes() const -> unsigned; [[nodiscard]] inline auto getBasicBlock() -> llvm::BasicBlock *; @@ -449,6 +480,7 @@ class Instruction : public Value { constexpr Instruction(ValKind kind_, unsigned curDepth, unsigned natDepth, unsigned maxDepth_) : Value(kind_, curDepth, natDepth, maxDepth_) {} + Instruction *reductionDst{nullptr}; public: static constexpr auto classof(const Node *v) -> bool { @@ -458,11 +490,17 @@ class Instruction : public Value { llvm::Intrinsic::ID ID; Node::ValKind kind; llvm::Type *type; + constexpr auto operator==(const Identifier &other) const -> bool = default; }; // declarations - [[nodiscard]] constexpr auto getIdentifier() const -> Identifier; + [[nodiscard]] auto getIdentifier() const -> Identifier; inline void setOperands(Arena<> *alloc, math::PtrVector); + constexpr void linkReductionDst(Instruction *op) { reductionDst = op; } + constexpr auto getReductionDst() const -> Instruction * { + return reductionDst; + } }; +static_assert(std::is_copy_assignable_v); /// CVal /// A constant value w/ respect to the loopnest. 
@@ -487,7 +525,7 @@ class Cnst : public Value { llvm::Type *typ; protected: - constexpr Cnst(ValKind kind, llvm::Type *t) : Value(kind) { typ = t; } + constexpr Cnst(ValKind knd, llvm::Type *t) : Value(knd) { typ = t; } public: static constexpr auto classof(const Node *v) -> bool { @@ -565,7 +603,7 @@ class Bint : public Cnst { const llvm::APInt &val; public: - constexpr Bint(llvm::ConstantInt *v, llvm::Type *t) + Bint(llvm::ConstantInt *v, llvm::Type *t) : Cnst(VK_Bint, t), val(v->getValue()) {} static constexpr auto create(Arena<> *alloc, llvm::ConstantInt *v, llvm::Type *t) -> Bint * { @@ -585,7 +623,7 @@ class Bflt : public Cnst { const llvm::APFloat &val; public: - constexpr Bflt(llvm::ConstantFP *v, llvm::Type *t) + Bflt(llvm::ConstantFP *v, llvm::Type *t) : Cnst(VK_Bflt, t), val(v->getValue()) {} static constexpr auto create(Arena<> *alloc, llvm::ConstantFP *v, llvm::Type *t) -> Bflt * { @@ -606,4 +644,10 @@ class Bflt : public Cnst { return false; } +class Compute; +struct InstByValue { + Compute *inst; + inline auto operator==(InstByValue const &other) const -> bool; +}; + } // namespace poly::IR diff --git a/include/IR/OrthogonalAxes.hpp b/include/IR/OrthogonalAxes.hpp new file mode 100644 index 000000000..297d498f1 --- /dev/null +++ b/include/IR/OrthogonalAxes.hpp @@ -0,0 +1,23 @@ +#pragma once +#ifndef OrthogonalAxes_hpp_INCLUDED +#define OrthogonalAxes_hpp_INCLUDED + +#include +#include + +/// `indep` must be `0` for any `invunrolls` it doesn't depend on +struct OrthogonalAxes { + /// Boolean: Are the axes independent? + uint32_t indep_axes : 1; + /// Bit mask: are the axes contiguous? 
+ uint32_t contig : 31; // max number of dims of 31 + /// Flag indicating whether the axis is independent of loops + /// `1` per independent loops + uint32_t indep; // max loop depth of 32 +}; +static_assert(sizeof(OrthogonalAxes) == 8); +constexpr auto operator==(OrthogonalAxes a, OrthogonalAxes b) -> bool { + return std::bit_cast(a) == std::bit_cast(b); +} + +#endif // OrthogonalAxes_hpp_INCLUDED diff --git a/include/IR/Predicate.hpp b/include/IR/Predicate.hpp index 2c176e657..d290bc9a6 100644 --- a/include/IR/Predicate.hpp +++ b/include/IR/Predicate.hpp @@ -1,9 +1,11 @@ #pragma once +#include "Containers/UnrolledList.hpp" #include "Dicts/BumpVector.hpp" +#include #include -#include #include +#include #include #include #include @@ -15,7 +17,6 @@ #include #include #include -#include namespace poly::IR { @@ -49,18 +50,18 @@ struct Intersection { constexpr Intersection(size_t index, Relation value) : predicates(static_cast(value) << (2 * index)) {} constexpr auto operator[](size_t index) const -> Relation { - assert(index < 32); + invariant(index < 32); return static_cast((predicates >> (2 * (index))) & 3); } void set(size_t index, Relation value) { - assert(index < 32); + invariant(index < 32); index += index; uint64_t maskedOff = predicates & ~(3ULL << (index)); predicates = maskedOff | static_cast(value) << (index); } [[nodiscard]] auto intersect(size_t index, Relation value) const -> Intersection { - assert(index < 32); + invariant(index < 32); index += index; return {predicates | static_cast(value) << (index)}; } @@ -153,7 +154,7 @@ struct Intersection { uint64_t mask = emptyMask(bitUnion); if (std::popcount(mask) == 1) { // a single b & !b case uint64_t remUnionMask = - ~(mask | (mask << 1)); // 0s `b`, meaning b can be either. + ~(mask | (mask << 1)); // 0s `b`, meaning b can be either. 
uint64_t w = remUnionMask & x; uint64_t z = remUnionMask & y; if (w == z) return {Intersection{w}}; @@ -281,7 +282,7 @@ struct Set { } else { allocated = true; intersectUnion.intersects = - alloc.create>(); + alloc->create>(); if (u.size() == 2) { intersectUnion.intersects->pushHasCapacity(u[0]); intersectUnion.intersects->pushHasCapacity(u[1]); @@ -377,7 +378,7 @@ struct Set { [[nodiscard]] auto operator&=(Set &pred) -> Set & { if (!pred.allocated) return *this &= pred.intersectUnion.intersect; pred.intersectUnion.intersects->forEach( - [&](Intersection pred) { *this &= pred; }); + [&](Intersection prd) { *this &= prd; }); return *this; } auto copy(Arena<> *alloc) const -> Set { diff --git a/include/IR/Users.hpp b/include/IR/Users.hpp index ab67c9cf1..4f542f431 100644 --- a/include/IR/Users.hpp +++ b/include/IR/Users.hpp @@ -1,12 +1,11 @@ #pragma once #include "Utilities/Invariant.hpp" -#include -#include +#include #include namespace poly::IR { -using utils::Arena, utils::invariant; +using alloc::Arena, utils::invariant; class Value; class Instruction; class Addr; diff --git a/include/LinearProgramming/LoopBlock.hpp b/include/LinearProgramming/LoopBlock.hpp index 3dc29e5d1..6910b4322 100644 --- a/include/LinearProgramming/LoopBlock.hpp +++ b/include/LinearProgramming/LoopBlock.hpp @@ -9,6 +9,7 @@ #include "Polyhedra/DependencyPolyhedra.hpp" #include "Polyhedra/Loops.hpp" #include "Polyhedra/Schedule.hpp" +#include #include #include #include @@ -17,12 +18,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -138,8 +137,8 @@ class LoopBlock { // dict::map userToMem{}; // dict::set visited{}; // llvm::LoopInfo *LI; - IR::Dependencies deps; - utils::OwningArena<> allocator{}; + IR::Dependencies &deps; + alloc::Arena<> &allocator; // we may turn off edges because we've exceeded its loop depth // or because the dependence has already been satisfied at an // earlier level. 
@@ -154,6 +153,9 @@ class LoopBlock { }; public: + constexpr LoopBlock(IR::Dependencies &deps_, alloc::Arena<> &allocator_) + : deps(deps_), allocator(allocator_) {} + struct OptimizationResult { IR::AddrChain addr; ScheduledNode *nodes; @@ -166,7 +168,6 @@ class LoopBlock { } }; - constexpr LoopBlock() = default; [[nodiscard]] auto optimize(IR::Cache &cache, IR::TreeResult tr) -> OptimizationResult { // first, we peel loops for which affine repr failed @@ -194,8 +195,20 @@ class LoopBlock { [[nodiscard]] constexpr auto getAllocator() -> Arena<> * { return &allocator; } + [[nodiscard]] constexpr auto getDependencies() -> IR::Dependencies & { + return deps; + } + [[nodiscard]] constexpr auto getDependencies() const + -> const IR::Dependencies & { + return deps; + } private: + struct LoadSummary { + Value *store; + poly::Loop *deepestLoop; + IR::AddrChain ac; + }; auto addScheduledNode(IR::Cache &cache, IR::Stow stow, IR::AddrChain addr) -> OptimizationResult { // how are we going to handle load duplication? 
@@ -256,8 +269,7 @@ class LoopBlock { /// // NOLINTNEXTLINE(misc-no-recursion) auto searchOperandsForLoads(IR::Cache &cache, IR::Stow stow, Value *val, - IR::AddrChain addr) - -> std::tuple { + IR::AddrChain addr) -> LoadSummary { auto *inst = llvm::dyn_cast(val); if (!inst) return {val, nullptr, addr}; // we use parent/child relationships here instead of next/prev @@ -267,7 +279,7 @@ class LoopBlock { if (load.getParent() != nullptr) { Arena<> *alloc = cache.getAllocator(); IR::Addr *reload = ((Addr *)load)->reload(alloc); - deps.copyDependencies(alloc, load, reload); + deps.copyDependencies(load, reload); invariant(reload->isLoad()); load = reload; addr.addAddr(reload); @@ -288,7 +300,7 @@ class LoopBlock { Addr *load = deps.reload(&allocator, store); stow.insertAfter(load); // insert load after stow addr.addAddr(load); - return {load, load->getLoop(), addr}; + return {load, load->getAffineLoop(), addr}; } auto *C = llvm::cast(inst); // could not find a load, so now we recurse, searching operands @@ -311,6 +323,9 @@ class LoopBlock { return {val, maxLoop, addr}; } + // We canonicalize offsets from `x[i - 1]` to `x[i]`, but being omega-shifted + // The LP minimizes omegas, which is intended to reduce distances. Thus, we + // want the distances to be reflected in the omegas. void shiftOmega(ScheduledNode *node) { unsigned nLoops = node->getNumLoops(); if (nLoops == 0) return; @@ -319,7 +334,8 @@ class LoopBlock { auto p1 = allocator.checkpoint(); MutSquarePtrMatrix A = math::matrix(&allocator, nLoops + 1); - // BumpPtrVector> omegaOffsets{allocator}; + // BumpPtrVector> + // omegaOffsets{allocator}; // // we check all memory accesses in the node, to see if applying the same // omega offsets can zero dependence offsets. If so, we apply the shift. // we look for offsets, then try and validate that the shift @@ -342,18 +358,18 @@ class LoopBlock { // input and output, no relative shift of shared loops possible // but indices may of course differ. 
for (ptrdiff_t d = 0; d < E.numRow(); ++d) { - MutPtrVector x = A(rank, _); - x[last] = E(d, 0); + MutPtrVector x = A[rank, _]; + x[last] = E[d, 0]; foundNonZeroOffset |= x[last] != 0; ptrdiff_t j = 0; for (; j < depCommon; ++j) - x[L - j] = E(d, j + numSyms) + E(d, j + numSyms + dep0); + x[L - j] = E[d, j + numSyms] + E[d, j + numSyms + dep0]; if (dep0 != dep1) { ptrdiff_t offset = dep0 > dep1 ? numSyms : numSyms + dep0; - for (; j < depMax; ++j) x[L - j] = E(d, j + offset); + for (; j < depMax; ++j) x[L - j] = E[d, j + offset]; } for (; j < nLoops; ++j) x[L - j] = 0; - rank = math::NormalForm::updateForNewRow(A(_(0, rank + 1), _)); + rank = math::NormalForm::updateForNewRow(A[_(0, rank + 1), _]); } } else { // dep between nodes @@ -361,13 +377,13 @@ class LoopBlock { unsigned offset = dep.isForward() ? numSyms + dep0 : numSyms, numDep = dep.isForward() ? dep1 : dep0; for (ptrdiff_t d = 0; d < E.numRow(); ++d) { - MutPtrVector x = A(rank, _); - x[last] = E(d, 0); + MutPtrVector x = A[rank, _]; + x[last] = E[d, 0]; foundNonZeroOffset |= x[last] != 0; ptrdiff_t j = 0; - for (; j < numDep; ++j) x[L - j] = E(d, j + offset); + for (; j < numDep; ++j) x[L - j] = E[d, j + offset]; for (; j < nLoops; ++j) x[L - j] = 0; - rank = math::NormalForm::updateForNewRow(A(_(0, rank + 1), _)); + rank = math::NormalForm::updateForNewRow(A[_(0, rank + 1), _]); } } } @@ -381,13 +397,13 @@ class LoopBlock { unsigned offset = dep.isForward() ? numSyms : numSyms + dep0, numDep = dep.isForward() ? 
dep0 : dep1; for (ptrdiff_t d = 0; d < E.numRow(); ++d) { - MutPtrVector x = A(rank, _); - x[last] = E(d, 0); + MutPtrVector x = A[rank, _]; + x[last] = E[d, 0]; foundNonZeroOffset |= x[last] != 0; ptrdiff_t j = 0; - for (; j < numDep; ++j) x[L - j] = E(d, j + offset); + for (; j < numDep; ++j) x[L - j] = E[d, j + offset]; for (; j < nLoops; ++j) x[L - j] = 0; - rank = math::NormalForm::updateForNewRow(A(_(0, rank + 1), _)); + rank = math::NormalForm::updateForNewRow(A[_(0, rank + 1), _]); } } } @@ -397,14 +413,14 @@ class LoopBlock { // matrix A is reasonably diagonalized, should indicate ptrdiff_t c = 0; for (ptrdiff_t r = 0; r < rank; ++r) { - int64_t off = A(r, last); + int64_t off = A[r, last]; if (off == 0) continue; for (; c < nLoops; ++c) { - if (A(r, c) != 0) break; + if (A[r, c] != 0) break; offs[L - c] = 0; } if (c == nLoops) return; - int64_t Arc = A(r, c), x = off / Arc; + int64_t Arc = A[r, c], x = off / Arc; if (x * Arc != off) continue; offs[L - c++] = x; // decrement loop `L-c` by `x` nonZero = true; @@ -433,8 +449,8 @@ class LoopBlock { for (ptrdiff_t l = 0; l < numDep; ++l) { int64_t mlt = offs[l]; if (mlt == 0) continue; - satL(0, _) -= mlt * satL(offset + l, _); - bndL(0, _) -= mlt * bndL(offset + l, _); + satL[0, _] -= mlt * satL[offset + l, _]; + bndL[0, _] -= mlt * bndL[offset + l, _]; } if (!repeat) break; repeat = false; @@ -454,8 +470,8 @@ class LoopBlock { for (size_t l = 0; l < numDep; ++l) { int64_t mlt = offs[l]; if (mlt == 0) continue; - satL(0, _) -= mlt * satL(offset + l, _); - bndL(0, _) -= mlt * bndL(offset + l, _); + satL[0, _] -= mlt * satL[offset + l, _]; + bndL[0, _] -= mlt * bndL[offset + l, _]; } } } @@ -478,7 +494,7 @@ class LoopBlock { continue; ptrdiff_t r = math::NormalForm::rank(indMat); if (r == edge.getInCurrentDepth()) continue; - // TODO handle linearly dependent acceses, filtering them out + // TODO handle linearly dependent accesses, filtering them out if (r != ptrdiff_t(indMat.numRow())) continue; 
node->schedulePhi(indMat, r); tryOrth = true; @@ -496,25 +512,25 @@ class LoopBlock { return math::SVector{edge.getNumLambda(), edge.getDynSymDim(), edge.getNumConstraints(), 1}; } - static constexpr auto countAuxParamsAndConstraints(IR::Dependencies deps, - ScheduledNode *nodes, - unsigned depth) + static constexpr auto + countAuxParamsAndConstraints(const IR::Dependencies &deps, + ScheduledNode *nodes, int depth) -> math::SVector { math::SVector params{}; assert(allZero(params)); for (ScheduledNode *node : nodes->getVertices()) for (Dependence d : node->inputEdges(deps)) - if (d.isActive(depth)) params += numParams(d); + if (!d.isSat(depth)) params += numParams(d); return params; } - using BackupSchedule = - math::ResizeableView, - unsigned>; - using BackupSat = math::ResizeableView, unsigned>; - using Backup = std::pair; + using BackupSchedule = math::ResizeableView< + containers::Pair, ptrdiff_t>; + using BackupSat = math::ResizeableView, ptrdiff_t>; + using Backup = containers::Pair; - static constexpr auto - setScheduleMemoryOffsets(Dependencies deps, ScheduledNode *nodes, unsigned d) + static constexpr auto setScheduleMemoryOffsets(const Dependencies &deps, + ScheduledNode *nodes, + unsigned d) -> std::array { // C, lambdas, omegas, Phis unsigned numOmegaCoefs = 0, numPhiCoefs = 0, numSlack = 0; @@ -531,8 +547,8 @@ class LoopBlock { } return {numOmegaCoefs, numPhiCoefs, numSlack}; } - static constexpr auto calcCoefs(Dependencies deps, ScheduledNode *nodes, - unsigned d) -> CoefCounts { + static constexpr auto calcCoefs(const Dependencies &deps, + ScheduledNode *nodes, int d) -> CoefCounts { auto [numOmegaCoefs, numPhiCoefs, numSlack] = setScheduleMemoryOffsets(deps, nodes, d); auto [numLambda, numBounding, numConstraints, numActiveEdges] = @@ -542,11 +558,11 @@ class LoopBlock { } // NOLINTNEXTLINE(misc-no-recursion) - [[nodiscard]] auto optimize(ScheduledNode *nodes, unsigned d, - unsigned maxDepth) -> Result { + [[nodiscard]] auto optimize(ScheduledNode 
*nodes, int d, int maxDepth) + -> Result { if (d >= maxDepth) return Result::independent(); if (Result r = solveGraph(nodes, maxDepth, false)) { - unsigned descend = d + 1; + int descend = d + 1; if (descend == maxDepth) return r; if (Result n = optimize(nodes, descend, maxDepth)) { if ((r == Result::dependent()) && @@ -557,17 +573,17 @@ class LoopBlock { } return breakGraph(nodes, d); } - /// solveGraph(ScheduledNode *nodes, unsigned depth, bool satisfyDeps) + /// solveGraph(ScheduledNode *nodes, int depth, bool satisfyDeps) /// solve the `nodes` graph at depth `d` /// if `satisfyDeps` is true, then we are trying to satisfy dependencies at /// this level /// - [[nodiscard]] auto solveGraph(ScheduledNode *nodes, unsigned depth, + [[nodiscard]] auto solveGraph(ScheduledNode *nodes, int depth, bool satisfyDeps) -> Result { CoefCounts counts{calcCoefs(deps, nodes, depth)}; return solveGraph(nodes, depth, satisfyDeps, counts); } - [[nodiscard]] auto solveGraph(ScheduledNode *nodes, unsigned depth, + [[nodiscard]] auto solveGraph(ScheduledNode *nodes, int depth, bool satisfyDeps, CoefCounts counts) -> Result { if (counts.numLambda == 0) { setSchedulesIndependent(nodes, depth); @@ -587,7 +603,7 @@ class LoopBlock { nodes, depth, counts, sol[_(counts.numPhiCoefs + counts.numOmegaCoefs, end)]); } - void setSchedulesIndependent(ScheduledNode *nodes, unsigned depth) { + void setSchedulesIndependent(ScheduledNode *nodes, int depth) { // IntMatrix A, N; for (ScheduledNode *node : nodes->getVertices()) { if ((depth >= node->getNumLoops()) || node->phiIsScheduled(depth)) @@ -596,7 +612,7 @@ class LoopBlock { setDepFreeSchedule(node, depth); } } - static void setDepFreeSchedule(ScheduledNode *node, unsigned depth) { + static void setDepFreeSchedule(ScheduledNode *node, int depth) { node->getOffsetOmega(depth) = 0; if (node->phiIsScheduled(depth)) return; // we'll check the null space of the phi's so far @@ -610,38 +626,39 @@ class LoopBlock { } // auto s = allocator->scope(); // 
TODO: use bumpalloc DenseMatrix nullSpace; // d x lfull - DenseMatrix A{node->getPhi()(_(0, depth), _).transpose()}; + DenseMatrix A{node->getPhi()[_(0, depth), _].t()}; math::NormalForm::nullSpace11(nullSpace, A); - invariant(unsigned(nullSpace.numRow()), node->getNumLoops() - depth); + invariant(ptrdiff_t(nullSpace.numRow()), + ptrdiff_t(node->getNumLoops()) - depth); // Now, we search index matrices for schedules not in the null space of // existing phi. This is because we're looking to orthogonalize a // memory access if possible, rather than setting a schedule // arbitrarily. // Here, we collect candidates for the next schedule DenseMatrix candidates{ - math::DenseDims{0, node->getNumLoops() + 1}}; + math::DenseDims<>{{0}, {node->getNumLoops() + 1}}}; Vector indv; indv.resizeForOverwrite(node->getNumLoops()); for (Addr *mem : node->localAddr()) { PtrMatrix indMat = mem->indexMatrix(); // lsub x d A.resizeForOverwrite( math::DenseDims{nullSpace.numRow(), indMat.numCol()}); - A = nullSpace(_, _(0, indMat.numRow())) * indMat; + A = nullSpace[_, _(0, indMat.numRow())] * indMat; // we search A for rows that aren't all zero for (ptrdiff_t d = 0; d < A.numCol(); ++d) { - if (allZero(A(_, d))) continue; - indv << indMat(_, d); + if (allZero(A[_, d])) continue; + indv << indMat[_, d]; bool found = false; for (ptrdiff_t j = 0; j < candidates.numRow(); ++j) { - if (candidates(j, _(0, last)) != indv) continue; + if (candidates[j, _(0, last)] != indv) continue; found = true; - ++candidates(j, 0); + ++candidates[j, 0]; break; } if (!found) { - candidates.resize(candidates.numRow() + 1); - assert(candidates(last, 0) == 0); - candidates(last, _(1, end)) << indv; + candidates.resize(++auto{candidates.numRow()}); + assert((candidates[last, 0]) == 0); + candidates[last, _(1, end)] << indv; } } } @@ -650,21 +667,21 @@ class LoopBlock { // number of repetitions (which were placed in first index) ptrdiff_t i = 0; for (ptrdiff_t j = 1; j < candidates.numRow(); ++j) - if 
(candidates(j, _) > candidates(i, _)) i = j; - node->getSchedule(depth) << candidates(i, _(1, end)); + if (candidates[j, _] > candidates[i, _]) i = j; + node->getSchedule(depth) << candidates[i, _(1, end)]; return; } // do we want to pick the outermost original loop, // or do we want to pick the outermost lex null space? node->getSchedule(depth) << 0; for (ptrdiff_t c = 0; c < nullSpace.numCol(); ++c) { - if (allZero(nullSpace(_, c))) continue; + if (allZero(nullSpace[_, c])) continue; node->getSchedule(depth)[c] = 1; return; } invariant(false); } - void updateSchedules(ScheduledNode *nodes, unsigned depth, CoefCounts counts, + void updateSchedules(ScheduledNode *nodes, int depth, CoefCounts counts, Simplex::Solution sol) { #ifndef NDEBUG if (counts.numPhiCoefs > 0) @@ -705,14 +722,14 @@ class LoopBlock { if (!node->phiIsScheduled(depth)) { int64_t l = sol[node->getPhiOffsetRange() + o].denomLCM(); for (ptrdiff_t i = 0; i < node->getPhi().numCol(); ++i) - assert(node->getPhi()(depth, i) == + assert((node->getPhi()[depth, i]) == sol[node->getPhiOffsetRange() + o][i] * l); } #endif } } - [[nodiscard]] auto deactivateSatisfiedEdges(ScheduledNode *nodes, - unsigned depth, CoefCounts counts, + [[nodiscard]] auto deactivateSatisfiedEdges(ScheduledNode *nodes, int depth, + CoefCounts counts, Simplex::Solution sol) -> Result { if (allZero(sol[_(begin, counts.numBounding + counts.numActiveEdges)])) return checkEmptySatEdges(nodes, depth); @@ -734,15 +751,15 @@ class LoopBlock { for (ScheduledNode *outNode : nodes->getVertices()) { for (Dependence edge : outNode->inputEdges(deps)) { if (edge.isInactive(depth)) continue; - Col uu = u + edge.getNumDynamicBoundingVar(); + ptrdiff_t uu = u + edge.getNumDynamicBoundingVar(); if ((sol[w++] != 0) || (anyNEZero(sol[_(u, uu)]))) { edge.setSatLevelLP(depth); result = Result::dependent(); } else { ScheduledNode *inNode = edge.input()->getNode(); - DensePtrMatrix inPhi = inNode->getPhi()(_(0, depth + 1), _), + DensePtrMatrix inPhi = 
inNode->getPhi()[_(0, depth + 1), _], outPhi = - outNode->getPhi()(_(0, depth + 1), _); + outNode->getPhi()[_(0, depth + 1), _]; edge.checkEmptySat(&allocator, inNode->getLoopNest(), inNode->getOffset(), inPhi, outNode->getLoopNest(), outNode->getOffset(), outPhi); @@ -752,14 +769,14 @@ class LoopBlock { } return result; } - auto checkEmptySatEdges(ScheduledNode *nodes, unsigned depth) -> Result { + auto checkEmptySatEdges(ScheduledNode *nodes, int depth) -> Result { for (ScheduledNode *outNode : nodes->getVertices()) { for (Dependence edge : outNode->inputEdges(deps)) { if (edge.isSat(depth)) continue; ScheduledNode *inNode = edge.input()->getNode(); invariant(edge.output()->getNode(), outNode); - DensePtrMatrix inPhi = inNode->getPhi()(_(0, depth + 1), _), - outPhi = outNode->getPhi()(_(0, depth + 1), _); + DensePtrMatrix inPhi = inNode->getPhi()[_(0, depth + 1), _], + outPhi = outNode->getPhi()[_(0, depth + 1), _]; edge.checkEmptySat(&allocator, inNode->getLoopNest(), inNode->getOffset(), inPhi, outNode->getLoopNest(), outNode->getOffset(), outPhi); @@ -810,8 +827,8 @@ class LoopBlock { deps.satLevelPair(Dependence::ID{dID}) = sat[i++]; } // NOLINTNEXTLINE(misc-no-recursion) - [[nodiscard]] auto optimizeSatDep(ScheduledNode *nodes, unsigned depth, - unsigned maxDepth, Result backupResult) + [[nodiscard]] auto optimizeSatDep(ScheduledNode *nodes, int depth, + int maxDepth, Result backupResult) -> Result { // if we're here, there are satisfied deps in both // depSatLevel and depSatNest @@ -830,7 +847,7 @@ class LoopBlock { return backupResult; } // NOLINTNEXTLINE(misc-no-recursion) - auto tryFuse(ScheduledNode *n0, ScheduledNode *n1, unsigned depth) -> Result { + auto tryFuse(ScheduledNode *n0, ScheduledNode *n1, int depth) -> Result { auto s = allocator.scope(); auto old0 = stashFit(n0); // FIXME: stash dep sat level auto old1 = stashFit(n1); // FIXME: stash dep sat level @@ -842,7 +859,7 @@ class LoopBlock { popStash(old1); return Result::failure(); } - auto 
satisfySplitEdges(ScheduledNode *nodes, unsigned depth) -> Result { + auto satisfySplitEdges(ScheduledNode *nodes, int depth) -> Result { auto s = allocator.scope(); dict::aset graph{&allocator}; for (ScheduledNode *node : nodes->getVertices()) graph.insert(node); @@ -857,18 +874,18 @@ class LoopBlock { } return (found) ? Result::dependent() : Result::independent(); } - auto solveSplitGraph(ScheduledNode *nodes, unsigned depth) -> Result { + auto solveSplitGraph(ScheduledNode *nodes, int depth) -> Result { Result sat = satisfySplitEdges(nodes, depth); Result opt = solveGraph(nodes, depth, false, calcCoefs(deps, nodes, depth)); if (!opt) return opt; return opt & sat; } // NOLINTNEXTLINE(misc-no-recursion) - [[nodiscard]] auto breakGraph(ScheduledNode *node, unsigned d) -> Result { + [[nodiscard]] auto breakGraph(ScheduledNode *node, int d) -> Result { // Get a top sorting of SCC's; because we couldn't solve the graph // with these dependencies fused, we'll try splitting them. ScheduledNode *components = - graph::stronglyConnectedComponents(ScheduleGraph(d), node); + graph::stronglyConnectedComponents(ScheduleGraph(deps, d), node); if (components->getNextComponent() == nullptr) return {}; // components are sorted in topological order. 
// We split all of them, solve independently, @@ -917,9 +934,8 @@ class LoopBlock { /// Phis: scheduling rotations /// w: bounding offsets, independent of symbolic variables /// u: bounding offsets, dependent on symbolic variables - auto instantiateOmniSimplex(ScheduledNode *nodes, unsigned d, - bool satisfyDeps, CoefCounts counts) - -> std::unique_ptr { + auto instantiateOmniSimplex(ScheduledNode *nodes, int d, bool satisfyDeps, + CoefCounts counts) -> std::unique_ptr { auto [numOmegaCoefs, numPhiCoefs, numSlack, numLambda, numBounding, numConstraints, numActiveEdges] = counts; auto omniSimplex = Simplex::create( @@ -933,9 +949,9 @@ class LoopBlock { // rows give constraints; each edge gets its own // numBounding = num u // numActiveEdges = num w - Row c = 0; - Col l = 1, o = 1 + numLambda + numSlack, p = o + numOmegaCoefs, - w = p + numPhiCoefs, u = w + numActiveEdges; + ptrdiff_t c = 0; + ptrdiff_t l = 1, o = 1 + numLambda + numSlack, p = o + numOmegaCoefs, + w = p + numPhiCoefs, u = w + numActiveEdges; for (ScheduledNode *inNode : nodes->getVertices()) { for (Dependence edge : inNode->outputEdges(deps, d)) { ScheduledNode *outNode = edge.output()->getNode(); @@ -948,26 +964,27 @@ class LoopBlock { bndO{edge.getBndOmegaCoefs()}, bndWU{edge.getBndCoefs()}; const ptrdiff_t numSatConstraints = satC.size(), numBndConstraints = bndC.size(); - const Col nPc = satPc.numCol(), nPp = satPp.numCol(); - invariant(nPc, bndPc.numCol()); - invariant(nPp, bndPp.numCol()); - Row cc = c + numSatConstraints; - Row ccc = cc + numBndConstraints; + const ptrdiff_t nPc = ptrdiff_t(satPc.numCol()), + nPp = ptrdiff_t(satPp.numCol()); + invariant(nPc, ptrdiff_t(bndPc.numCol())); + invariant(nPp, ptrdiff_t(bndPp.numCol())); + ptrdiff_t cc = c + numSatConstraints; + ptrdiff_t ccc = cc + numBndConstraints; - Col ll = l + satL.numCol(); - Col lll = ll + bndL.numCol(); - C(_(c, cc), _(l, ll)) << satL; - C(_(cc, ccc), _(ll, lll)) << bndL; + ptrdiff_t ll = l + ptrdiff_t(satL.numCol()); + 
ptrdiff_t lll = ll + ptrdiff_t(bndL.numCol()); + C[_(c, cc), _(l, ll)] << satL; + C[_(cc, ccc), _(ll, lll)] << bndL; l = lll; // bounding - C(_(cc, ccc), w++) << bndWU(_, 0); - Col uu = u + bndWU.numCol() - 1; - C(_(cc, ccc), _(u, uu)) << bndWU(_, _(1, end)); + C[_(cc, ccc), w++] << bndWU[_, 0]; + ptrdiff_t uu = u + ptrdiff_t(bndWU.numCol()) - 1; + C[_(cc, ccc), _(u, uu)] << bndWU[_, _(1, end)]; u = uu; if (!satisfyDeps || !edge.stashedPreventsReordering(d)) - C(_(c, cc), 0) << satC; - else C(_(c, cc), 0) << satC + satW; - C(_(cc, ccc), 0) << bndC; + C[_(c, cc), 0] << satC; + else C[_(c, cc), 0] << satC + satW; + C[_(cc, ccc), 0] << bndC; // now, handle Phi and Omega // phis are not constrained to be 0 if (outNode == inNode) { @@ -976,17 +993,17 @@ class LoopBlock { if (outNode->phiIsScheduled(d)) { // add it constants auto sch = outNode->getSchedule(d); - C(_(c, cc), 0) -= - satPc * sch[_(0, nPc)] + satPp * sch[_(0, nPp)]; - C(_(cc, ccc), 0) -= - bndPc * sch[_(0, nPc)] + bndPp * sch[_(0, nPp)]; + C[_(c, cc), 0] -= + satPc * sch[_(0, nPc)].t() + satPp * sch[_(0, nPp)].t(); + C[_(cc, ccc), 0] -= + bndPc * sch[_(0, nPc)].t() + bndPp * sch[_(0, nPp)].t(); } else { // FIXME: phiChild = [14:18), 4 cols // while Dependence seems to indicate 2 // loops why the disagreement? 
auto po = outNode->getPhiOffset() + p; - C(_(c, cc), _(po, po + nPc)) << satPc + satPp; - C(_(cc, ccc), _(po, po + nPc)) << bndPc + bndPp; + C[_(c, cc), _(po, po + nPc)] << satPc + satPp; + C[_(cc, ccc), _(po, po + nPc)] << bndPc + bndPp; } } else if (outNode->phiIsScheduled(d)) { // add it constants @@ -994,30 +1011,30 @@ class LoopBlock { // inner -> outer // so we need to drop inner most if one has less auto sch = outNode->getSchedule(d); - auto schP = sch[_(0, nPp)]; - auto schC = sch[_(0, nPc)]; - C(_(c, cc), 0) -= satPc * schC + satPp * schP; - C(_(cc, ccc), 0) -= bndPc * schC + bndPp * schP; + auto schP = sch[_(0, nPp)].t(); + auto schC = sch[_(0, nPc)].t(); + C[_(c, cc), 0] -= satPc * schC + satPp * schP; + C[_(cc, ccc), 0] -= bndPc * schC + bndPp * schP; } else if (nPc < nPp) { // Pp has more cols, so outer/leftmost overlap auto po = outNode->getPhiOffset() + p, poc = po + nPc, pop = po + nPp; - C(_(c, cc), _(po, poc)) << satPc + satPp(_, _(0, nPc)); - C(_(cc, ccc), _(po, poc)) << bndPc + bndPp(_, _(0, nPc)); - C(_(c, cc), _(poc, pop)) << satPp(_, _(nPc, end)); - C(_(cc, ccc), _(poc, pop)) << bndPp(_, _(nPc, end)); + C[_(c, cc), _(po, poc)] << satPc + satPp[_, _(0, nPc)]; + C[_(cc, ccc), _(po, poc)] << bndPc + bndPp[_, _(0, nPc)]; + C[_(c, cc), _(poc, pop)] << satPp[_, _(nPc, end)]; + C[_(cc, ccc), _(poc, pop)] << bndPp[_, _(nPc, end)]; } else /* if (nPc > nPp) */ { auto po = outNode->getPhiOffset() + p, poc = po + nPc, pop = po + nPp; - C(_(c, cc), _(po, pop)) << satPc(_, _(0, nPp)) + satPp; - C(_(cc, ccc), _(po, pop)) << bndPc(_, _(0, nPp)) + bndPp; - C(_(c, cc), _(pop, poc)) << satPc(_, _(nPp, end)); - C(_(cc, ccc), _(pop, poc)) << bndPc(_, _(nPp, end)); + C[_(c, cc), _(po, pop)] << satPc[_, _(0, nPp)] + satPp; + C[_(cc, ccc), _(po, pop)] << bndPc[_, _(0, nPp)] + bndPp; + C[_(c, cc), _(pop, poc)] << satPc[_, _(nPp, end)]; + C[_(cc, ccc), _(pop, poc)] << bndPc[_, _(nPp, end)]; } - C(_(c, cc), outNode->getOmegaOffset() + o) - << satO(_, 0) + satO(_, 1); - 
C(_(cc, ccc), outNode->getOmegaOffset() + o) - << bndO(_, 0) + bndO(_, 1); + C[_(c, cc), outNode->getOmegaOffset() + o] + << satO[_, 0] + satO[_, 1]; + C[_(cc, ccc), outNode->getOmegaOffset() + o] + << bndO[_, 0] + bndO[_, 1]; } } else { if (d < edge.getOutCurrentDepth()) @@ -1033,16 +1050,16 @@ class LoopBlock { if (d < edge.getOutCurrentDepth()) { if (d < edge.getInCurrentDepth()) invariant(inNode->getOmegaOffset() != outNode->getOmegaOffset()); - C(_(c, cc), outNode->getOmegaOffset() + o) - << satO(_, edge.isForward()); - C(_(cc, ccc), outNode->getOmegaOffset() + o) - << bndO(_, edge.isForward()); + C[_(c, cc), outNode->getOmegaOffset() + o] + << satO[_, edge.isForward()]; + C[_(cc, ccc), outNode->getOmegaOffset() + o] + << bndO[_, edge.isForward()]; } if (d < edge.getInCurrentDepth()) { - C(_(c, cc), inNode->getOmegaOffset() + o) - << satO(_, !edge.isForward()); - C(_(cc, ccc), inNode->getOmegaOffset() + o) - << bndO(_, !edge.isForward()); + C[_(c, cc), inNode->getOmegaOffset() + o] + << satO[_, !edge.isForward()]; + C[_(cc, ccc), inNode->getOmegaOffset() + o] + << bndO[_, !edge.isForward()]; } } c = ccc; @@ -1056,25 +1073,26 @@ class LoopBlock { static void updateConstraints(MutPtrMatrix C, const ScheduledNode *node, PtrMatrix sat, PtrMatrix bnd, - unsigned d, Row c, Row cc, Row ccc, Col p) { + unsigned d, ptrdiff_t c, ptrdiff_t cc, + ptrdiff_t ccc, ptrdiff_t p) { invariant(sat.numCol(), bnd.numCol()); if (node->phiIsScheduled(d)) { // add it constants - auto sch = node->getSchedule(d)[_(0, sat.numCol())]; + auto sch = node->getSchedule(d)[_(0, sat.numCol())].t(); // order is inner <-> outer // so we need the end of schedule if it is larger - C(_(c, cc), 0) -= sat * sch; - C(_(cc, ccc), 0) -= bnd * sch; + C[_(c, cc), 0] -= sat * sch; + C[_(cc, ccc), 0] -= bnd * sch; } else { // add it to C auto po = node->getPhiOffset() + p; - C(_(c, cc), _(po, po + sat.numCol())) << sat; - C(_(cc, ccc), _(po, po + bnd.numCol())) << bnd; + C[_(c, cc), _(po, po + 
ptrdiff_t(sat.numCol()))] << sat; + C[_(cc, ccc), _(po, po + ptrdiff_t(bnd.numCol()))] << bnd; } } - void addIndependentSolutionConstraints(NotNull omniSimplex, - const ScheduledNode *nodes, unsigned d, - CoefCounts counts) { + void addIndependentSolutionConstraints(Valid omniSimplex, + const ScheduledNode *nodes, + ptrdiff_t d, CoefCounts counts) { // omniSimplex->setNumCons(omniSimplex->getNumCons() + // memory.size()); // omniSimplex->reserveExtraRows(memory.size()); @@ -1086,9 +1104,9 @@ class LoopBlock { for (const ScheduledNode *node : nodes->getVertices()) { if (node->phiIsScheduled(d) || (!node->hasActiveEdges(deps, d))) continue; - C(i, 0) = 1; - C(i, node->getPhiOffsetRange() + o) << 1; - C(i++, ++s) = -1; // for >= + C[i, 0] = 1; + C[i, node->getPhiOffsetRange() + o] << 1; + C[i++, ++s] = -1; // for >= } } else { DenseMatrix A, N; @@ -1096,21 +1114,22 @@ class LoopBlock { if (node->phiIsScheduled(d) || (d >= node->getNumLoops()) || (!node->hasActiveEdges(deps, d))) continue; - A.resizeForOverwrite(Row{ptrdiff_t(node->getPhi().numCol())}, Col{d}); - A << node->getPhi()(_(0, d), _).transpose(); + A.resizeForOverwrite(Row<>{ptrdiff_t(node->getPhi().numCol())}, + Col<>{d}); + A << node->getPhi()[_(0, d), _].t(); math::NormalForm::nullSpace11(N, A); // we add sum(NullSpace,dims=1) >= 1 // via 1 = sum(NullSpace,dims=1) - s, s >= 0 - C(i, 0) = 1; - MutPtrVector cc{C(i, node->getPhiOffsetRange() + o)}; + C[i, 0] = 1; + MutPtrVector cc{C[i, node->getPhiOffsetRange() + o]}; // sum(N,dims=1) >= 1 after flipping row signs to be lex > 0 for (ptrdiff_t m = 0; m < N.numRow(); ++m) - cc += N(m, _) * lexSign(N(m, _)); - C(i++, ++s) = -1; // for >= + cc += N[m, _] * lexSign(N[m, _]); + C[i++, ++s] = -1; // for >= } } invariant(ptrdiff_t(omniSimplex->getNumCons()), i); - assert(!allZero(omniSimplex->getConstraints()(last, _))); + assert(!allZero(omniSimplex->getConstraints()[last, _])); } [[nodiscard]] static constexpr auto lexSign(PtrVector x) -> int64_t { for (auto a : 
x) @@ -1118,7 +1137,6 @@ class LoopBlock { invariant(false); return 0; } - // // // @@ -1143,10 +1161,11 @@ class LoopBlock { return os; } }; -inline auto operator<<(llvm::raw_ostream &os, - std::pair nodesdeps) +inline auto +operator<<(llvm::raw_ostream &os, + containers::Pair nodesdeps) -> llvm::raw_ostream & { - auto [nodes, deps] = nodesdeps; + const auto &[nodes, deps] = nodesdeps; os << "\nLoopBlock graph:\n"; size_t i = 0; for (ScheduledNode *v : nodes->getVertices()) { @@ -1158,7 +1177,7 @@ inline auto operator<<(llvm::raw_ostream &os, os << "\nLoopBlock Edges:"; for (ScheduledNode *inNode : nodes->getVertices()) { poly::AffineSchedule sin = inNode->getSchedule(); - for (Dependence edge : nodes->outputEdges(deps)) { + for (Dependence edge : nodes->outputEdges(*deps)) { os << "\n\n\tEdge = " << edge; ScheduledNode *outNode = edge.output()->getNode(); os << "Schedule In: s.getPhi() =" << sin.getPhi() diff --git a/include/LinearProgramming/ScheduledNode.hpp b/include/LinearProgramming/ScheduledNode.hpp index 9f89d9c63..c851f0d24 100644 --- a/include/LinearProgramming/ScheduledNode.hpp +++ b/include/LinearProgramming/ScheduledNode.hpp @@ -8,6 +8,7 @@ #include "Utilities/ListRanges.hpp" #include #include +#include #include #include #include @@ -19,7 +20,7 @@ using math::PtrVector, math::MutPtrVector, math::DensePtrMatrix, math::MutDensePtrMatrix, math::SquarePtrMatrix, math::MutSquarePtrMatrix, math::end, math::last, math::_, math::Simplex; using poly::Dependence, poly::DepPoly; -using utils::NotNull, utils::invariant, utils::Optional, utils::Arena; +using utils::Valid, utils::invariant, utils::Optional, alloc::Arena; /// ScheduledNode /// Represents a set of memory accesses that are optimized together in the LP. 
@@ -32,8 +33,8 @@ using utils::NotNull, utils::invariant, utils::Optional, utils::Arena; /// class ScheduledNode { - NotNull store; // linked list to loads, iterate over getChild - NotNull loopNest; + Valid store; // linked list to loads, iterate over getChild + Valid loopNest; ScheduledNode *next{nullptr}; ScheduledNode *component{nullptr}; // SCC cycle, or last node in a chain // Dependence *dep{nullptr}; // input edges (points to parents) @@ -62,8 +63,8 @@ class ScheduledNode { auto L = getNumLoops(); return L * L; } - constexpr ScheduledNode(Addr *store, poly::Loop *L) - : store(store), loopNest(L) { + constexpr ScheduledNode(Addr *write, poly::Loop *L) + : store(write), loopNest(L) { mem[0] = L->getNumLoops(); getFusionOmega() << 0; } @@ -122,10 +123,10 @@ class ScheduledNode { } constexpr void setOffsets(int64_t *o) { offsets = o; } struct NextAddr { - constexpr auto operator()(Addr *a) const -> Addr * { + auto operator()(Addr *a) const -> Addr * { return llvm::cast_or_null(a->getNext()); } - constexpr auto operator()(const Addr *a) const -> const Addr * { + auto operator()(const Addr *a) const -> const Addr * { return llvm::cast_or_null(a->getNext()); } }; @@ -192,12 +193,12 @@ class ScheduledNode { } }; template struct Deps { - poly::Dependencies dep; + const poly::Dependencies *dep; constexpr auto operator()(int32_t id) const { if constexpr (Out) - return dep.outputEdgeIDs(id) | std::views::transform(OutNode{dep}); - else return dep.inputEdgeIDs(id) | std::views::transform(InNode{dep}); + return dep->outputEdgeIDs(id) | std::views::transform(OutNode{dep}); + else return dep->inputEdgeIDs(id) | std::views::transform(InNode{dep}); } constexpr auto operator()(IR::Addr *a) const { if constexpr (Out) return (*this)(a->getEdgeOut()); @@ -205,11 +206,11 @@ class ScheduledNode { } }; template struct DepIDs { - poly::Dependencies dep; + const poly::Dependencies *dep; constexpr auto operator()(int32_t id) const { - if constexpr (Out) return dep.outputEdgeIDs(id); - 
else return dep.inputEdgeIDs(id); + if constexpr (Out) return dep->outputEdgeIDs(id); + else return dep->inputEdgeIDs(id); } constexpr auto operator()(IR::Addr *a) const { if constexpr (Out) return (*this)(a->getEdgeOut()); @@ -217,15 +218,15 @@ class ScheduledNode { } }; template struct DepFilter { - poly::Dependencies dep; + const poly::Dependencies *dep; unsigned depth; constexpr auto operator()(int32_t id) const { if constexpr (Out) - return dep.outputEdgeIDs(id) | dep.activeFilter(depth) | + return dep->outputEdgeIDs(id) | dep->activeFilter(depth) | std::views::transform(OutNode{dep}); else - return dep.inputEdgeIDs(id) | dep.activeFilter(depth) | + return dep->inputEdgeIDs(id) | dep->activeFilter(depth) | std::views::transform(InNode{dep}); } constexpr auto operator()(IR::Addr *a) const { @@ -236,31 +237,31 @@ class ScheduledNode { // all nodes that are memory inputs to this one; i.e. all parents // NOTE: we may reach each node multiple times - [[nodiscard]] inline auto inNeighbors(IR::Dependencies dep) { + [[nodiscard]] inline auto inNeighbors(const IR::Dependencies &dep) { return utils::NestedList{utils::ListRange{store, NextAddr{}}, - Deps{dep}}; + Deps{&dep}}; } // all nodes that are memory inputs to this one; i.e. all parents // NOTE: we may reach each node multiple times // all nodes that are memory outputs of this one; i.e. 
all children // NOTE: we may reach each node multiple times - [[nodiscard]] inline auto outNeighbors(IR::Dependencies dep) { + [[nodiscard]] inline auto outNeighbors(const IR::Dependencies &dep) { return utils::NestedList{utils::ListRange{store, NextAddr{}}, - Deps{dep}}; + Deps{&dep}}; } - [[nodiscard]] inline auto inputEdgeIds(IR::Dependencies dep) const { + [[nodiscard]] inline auto inputEdgeIds(const IR::Dependencies &dep) const { return utils::NestedList{utils::ListRange{store, NextAddr{}}, - DepIDs{dep}}; + DepIDs{&dep}}; } - [[nodiscard]] inline auto outputEdgeIds(IR::Dependencies dep) const { + [[nodiscard]] inline auto outputEdgeIds(const IR::Dependencies &dep) const { return utils::NestedList{utils::ListRange{store, NextAddr{}}, - DepIDs{dep}}; + DepIDs{&dep}}; } - [[nodiscard]] inline auto inputEdgeIds(IR::Dependencies dep, - unsigned depth) const { + [[nodiscard]] inline auto inputEdgeIds(const IR::Dependencies &dep, + int depth) const { static_assert(std::forward_iterator< - decltype(DepIDs{dep}((IR::Addr *)nullptr).begin())>); + decltype(DepIDs{&dep}((IR::Addr *)nullptr).begin())>); static_assert(std::forward_iterator); static_assert(std::forward_iterator); @@ -268,90 +269,89 @@ class ScheduledNode { return inputEdgeIds(dep) | dep.activeFilter(depth); } [[nodiscard]] inline auto outputEdgeIds(IR::Dependencies dep, - unsigned depth) const { + int depth) const { static_assert(std::forward_iterator); static_assert(std::ranges::range); return outputEdgeIds(dep) | dep.activeFilter(depth); } - [[nodiscard]] inline auto inputEdges(IR::Dependencies dep) { + [[nodiscard]] inline auto inputEdges(const IR::Dependencies &dep) { return utils::NestedList{ utils::ListRange{store, NextAddr{}, [](Addr *a) -> int32_t { return a->getEdgeIn(); }}, - [=](int32_t id) { + [&](int32_t id) { return dep.inputEdgeIDs(id) | - std::views::transform([=](int32_t i) -> Dependence { - IR::Dependencies d2 = dep; - return d2.get(Dependence::ID{i}); + std::views::transform([&](int32_t 
i) -> Dependence { + return dep.get(Dependence::ID{i}); }); }}; } - [[nodiscard]] inline auto outputEdges(IR::Dependencies dep) { + [[nodiscard]] inline auto outputEdges(const IR::Dependencies &dep) { return utils::NestedList{ utils::ListRange{store, NextAddr{}, [](Addr *a) -> int32_t { return a->getEdgeOut(); }}, - [=](int32_t id) { + [&](int32_t id) { return dep.outputEdgeIDs(id) | - std::views::transform([=](int32_t i) -> Dependence { - IR::Dependencies d2 = dep; - return d2.get(Dependence::ID{i}); + std::views::transform([&](int32_t i) -> Dependence { + return dep.get(Dependence::ID{i}); }); }}; } - [[nodiscard]] inline auto inputEdges(IR::Dependencies dep, unsigned depth) { + [[nodiscard]] inline auto inputEdges(const IR::Dependencies &dep, int depth) { return utils::NestedList{ utils::ListRange{store, NextAddr{}, [](Addr *a) -> int32_t { return a->getEdgeIn(); }}, - [=](int32_t id) { + [&](int32_t id) { return dep.inputEdgeIDs(id) | dep.activeFilter(depth) | - std::views::transform([=](int32_t i) -> Dependence { - IR::Dependencies d2 = dep; - return d2.get(Dependence::ID{i}); + std::views::transform([&](int32_t i) -> Dependence { + return dep.get(Dependence::ID{i}); }); }}; } - [[nodiscard]] inline auto outputEdges(IR::Dependencies dep, unsigned depth) { + [[nodiscard]] inline auto outputEdges(const IR::Dependencies &dep, + int depth) { return utils::NestedList{ utils::ListRange{store, NextAddr{}, [](Addr *a) -> int32_t { return a->getEdgeOut(); }}, - [=](int32_t id) { + [&](int32_t id) { return dep.outputEdgeIDs(id) | dep.activeFilter(depth) | - std::views::transform([=](int32_t i) -> Dependence { - IR::Dependencies d2 = dep; - return d2.get(Dependence::ID{i}); + std::views::transform([&](int32_t i) -> Dependence { + return dep.get(Dependence::ID{i}); }); }}; } struct InNode { - poly::Dependencies dep; - constexpr auto operator()(int32_t id) -> ScheduledNode * { - return dep.input(Dependence::ID{id})->getNode(); + const poly::Dependencies *dep; + constexpr 
auto operator()(int32_t id) const -> ScheduledNode * { + return dep->input(Dependence::ID{id})->getNode(); } }; struct OutNode { - poly::Dependencies dep; - constexpr auto operator()(int32_t id) -> ScheduledNode * { - return dep.output(Dependence::ID{id})->getNode(); + const poly::Dependencies *dep; + constexpr auto operator()(int32_t id) const -> ScheduledNode * { + return dep->output(Dependence::ID{id})->getNode(); } }; - [[nodiscard]] inline auto outNeighbors(IR::Dependencies dep, unsigned depth) { + [[nodiscard]] inline auto outNeighbors(const IR::Dependencies &dep, + unsigned depth) { return utils::NestedList{ utils::ListRange{store, NextAddr{}, GetEdge{}}, - DepFilter{dep, depth}}; + DepFilter{&dep, depth}}; } - [[nodiscard]] inline auto inNeighbors(IR::Dependencies dep, unsigned depth) { + [[nodiscard]] inline auto inNeighbors(const IR::Dependencies &dep, + unsigned depth) { return utils::NestedList{ utils::ListRange{store, NextAddr{}, GetEdge{}}, - DepFilter{dep, depth}}; + DepFilter{&dep, depth}}; } - [[nodiscard]] inline auto hasActiveEdges(IR::Dependencies dep, + [[nodiscard]] inline auto hasActiveEdges(const IR::Dependencies &dep, unsigned depth) const -> bool { - const auto f = [=](int32_t d) { + const auto f = [&](int32_t d) { return !dep.isSat(Dependence::ID{d}, depth); }; return std::ranges::any_of(inputEdgeIds(dep), f) || @@ -361,8 +361,7 @@ class ScheduledNode { [[nodiscard]] constexpr auto getSchedule() -> poly::AffineSchedule { return {mem}; } - [[nodiscard]] constexpr auto getLoopNest() const - -> NotNull { + [[nodiscard]] constexpr auto getLoopNest() const -> poly::Loop * { return loopNest; } @@ -373,8 +372,8 @@ class ScheduledNode { // [[nodiscard]] constexpr auto wasVisited2() const -> bool { return visited2; // } constexpr void visit2() { visited2 = true; } constexpr void unVisit2() { // visited2 = false; } - [[nodiscard]] constexpr auto getNumLoops() const -> unsigned { - return unsigned(mem[0]); + [[nodiscard]] constexpr auto 
getNumLoops() const -> ptrdiff_t { + return mem[0]; } // 'phiIsScheduled()` means that `phi`'s schedule has been // set for the outer `rank` loops. @@ -397,21 +396,23 @@ class ScheduledNode { -> math::Range { return _(phiOffset, phiOffset + getNumLoops()); } + /// numLoops x numLoops // NOLINTNEXTLINE(readability-make-member-function-const) [[nodiscard]] constexpr auto getPhi() -> MutSquarePtrMatrix { - return {mem + 1, math::SquareDims{unsigned(getNumLoops())}}; + return {mem + 1, math::SquareDims<>{unsigned(getNumLoops())}}; } + /// numLoops x numLoops [[nodiscard]] constexpr auto getPhi() const -> SquarePtrMatrix { - return {const_cast(mem) + 1, math::SquareDims{getNumLoops()}}; + return {const_cast(mem) + 1, math::SquareDims<>{getNumLoops()}}; } /// getSchedule, loops are always indexed from outer to inner [[nodiscard]] constexpr auto getSchedule(ptrdiff_t d) const -> PtrVector { - return getPhi()(d, _); + return getPhi()[d, _]; } [[nodiscard]] constexpr auto getSchedule(ptrdiff_t d) -> MutPtrVector { - return getPhi()(d, _); + return getPhi()[d, _]; } [[nodiscard]] constexpr auto getFusionOmega(ptrdiff_t i) const -> int64_t { return (mem + 1)[getNumLoopsSquared() + i]; @@ -453,8 +454,8 @@ class ScheduledNode { MutSquarePtrMatrix phi = getPhi(); ptrdiff_t indR = ptrdiff_t(indMat.numCol()); for (ptrdiff_t i = 0; i < r; ++i) { - phi(i, _(0, indR)) << indMat(i, _); - phi(i, _(indR, end)) << 0; + phi[i, _(0, indR)] << indMat[i, _]; + phi[i, _(indR, end)] << 0; } rank = r; } @@ -463,10 +464,10 @@ class ScheduledNode { return omegaOffset; } void resetPhiOffset() { phiOffset = std::numeric_limits::max(); } - [[nodiscard]] constexpr auto calcGraphMaxDepth() const -> unsigned { - unsigned maxDepth = 0; + [[nodiscard]] constexpr auto calcGraphMaxDepth() const -> int { + int maxDepth = 0; for (const ScheduledNode *n : getVertices()) - maxDepth = std::max(maxDepth, n->getNumLoops()); + maxDepth = std::max(maxDepth, int(n->getNumLoops())); return maxDepth; } friend inline 
auto operator<<(llvm::raw_ostream &os, @@ -481,12 +482,13 @@ static_assert(std::is_trivially_destructible_v); static_assert(sizeof(ScheduledNode) <= 64); // fits in cache line class ScheduleGraph { - IR::Dependencies deps; - unsigned depth; + const IR::Dependencies &deps; + unsigned depth_; public: using VertexType = ScheduledNode; - constexpr ScheduleGraph(unsigned depth) : depth(depth) {} + constexpr ScheduleGraph(const IR::Dependencies &deps_, unsigned depth) + : deps(deps_), depth_(depth) {} [[nodiscard]] static constexpr auto getVertices(ScheduledNode *nodes) -> utils::ListRange { @@ -496,11 +498,11 @@ class ScheduleGraph { -> utils::ListRange { return static_cast(nodes)->getVertices(); } - [[nodiscard]] constexpr auto outNeighbors(ScheduledNode *v) const { - return v->outNeighbors(deps, depth); + [[nodiscard]] auto outNeighbors(ScheduledNode *v) const { + return v->outNeighbors(deps, depth_); } - [[nodiscard]] constexpr auto inNeighbors(ScheduledNode *v) const { - return v->inNeighbors(deps, depth); + [[nodiscard]] auto inNeighbors(ScheduledNode *v) const { + return v->inNeighbors(deps, depth_); } }; @@ -508,10 +510,14 @@ class ScheduleGraph { namespace graph { // static_assert(AbstractPtrGraph); -static_assert(std::forward_iterator< - decltype(lp::ScheduleGraph{0}.outNeighbors(nullptr).begin())>); -static_assert(std::forward_iterator< - decltype(lp::ScheduleGraph{0}.inNeighbors(nullptr).begin())>); +static_assert(std::forward_iterator(), 0} + .outNeighbors(nullptr) + .begin())>); +static_assert(std::forward_iterator(), 0} + .inNeighbors(nullptr) + .begin())>); static_assert(AbstractPtrGraph); } // namespace graph } // namespace poly diff --git a/include/Optimize/CostFunction.hpp b/include/Optimize/CostFunction.hpp new file mode 100644 index 000000000..7891ab6be --- /dev/null +++ b/include/Optimize/CostFunction.hpp @@ -0,0 +1,798 @@ +#pragma once + +#include "IR/Address.hpp" +#include "IR/Instruction.hpp" +#include "IR/Node.hpp" +#include 
"IR/OrthogonalAxes.hpp" +#include "Optimize/CostModeling.hpp" +#include "Optimize/Legality.hpp" +#include "Optimize/RegisterFile.hpp" +#include "Polyhedra/Dependence.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace poly::CostModeling { +using containers::Pair; +using math::AbstractVector, math::AbstractMatrix, math::DensePtrMatrix, math::_; +using utils::Optional; + +/// POD. Gives counts for the different kinds of costs. +/// Fields: +/// `int16_t trip_count`- we're unlikely to change decisions for >32k +/// negative indicates compile-time known size. +/// `uint16_t memory` number of mem sets. +/// `bool exit` loop exit/entry. +/// `uint31_t compute` number of compute sets. +/// These give us info for iterating over the costs associated with a loop. +/// for (i : I){ +/// for (j : J){ +/// for (k : K){ // leaf +/// ... +/// } +/// for (k : K){ // leaf +/// ... +/// } +/// } +/// for (j : J){ // leaf +/// ... +/// } +/// } +/// For leaves, we compute latency as well as register cost. +/// Note that we compute all costs at the header for a given depth, +/// thus we only need headers and num-pops. 
+struct LoopCostCounts { + uint16_t known_trip : 1; + uint16_t trip_count : 15; + uint16_t compute; + uint16_t omemory; + uint8_t cmemory; + uint8_t exit : 5; /// how many blocks we exit after this + uint8_t l2vectorWidth : 3 {0}; // 1<<7 == 128 +}; +static_assert(sizeof(LoopCostCounts) == 8); + +/// Order is outermost -> innermost +struct VectorizationFactor { + uint32_t l2factor; + // trailing bit is outermost loop, so if iterating by shifting, + // we go outer->inner + uint32_t indexMask; + constexpr operator IR::VectorWidth() const { + return IR::VectorWidth{unsigned(1) << l2factor, l2factor}; + } +}; + +/// TODO: maybe two `uint8_t`s + `uint16_t` +/// We only get up to 16 dimensions, but that is already excessive +/// One `uint8_t` gives contig axis, the other the index into +/// the memory cost kind. Thus, the struct could differentiate +/// loads vs stores by itself, while also differentiating +/// between eltypes. +/// Another option is to store individual `MemoryCosts`, +/// so that we can aggregate/sum up. +struct MemCostSummary { + IR::Addr::Costs memcost; + OrthogonalAxes orth; + // [[nodiscard]] constexpr auto contigAxis() const -> uint32_t { + // return data & 0xff; + // } + // mask containing `0` for dependent axes, 1s for independent + // should contain `0` for all non-existent loops, e.g. + // for (i = I, j = J, k = K, l = L) { + // A[j,l] + // for (a = A, b = B){ .... } + // } + // The mask should equal (1<<0) | (1<<2) (for the i and k). + // Only loops it is nested in that it doesn't depend on count. 
+ // [[nodiscard]] constexpr auto indepAxes() const -> uint32_t { + // return data >> 8; + // }; +}; +constexpr auto operator&(MemCostSummary a, MemCostSummary b) -> uint32_t { + return a.orth.indep & b.orth.indep; +} +/// Basic idea is that costs are divided by loops they do not depend on +/// +constexpr auto cost(const AbstractMatrix auto &invunrolls, uint32_t indepAxes) + -> utils::eltype_t { + if (!indepAxes) return 1; + uint32_t tz = std::countr_zero(indepAxes); + utils::eltype_t c{invunrolls[0, tz++]}; + for (uint32_t d = indepAxes >> tz, i = tz; d; d >>= tz, i += tz) { + tz = std::countr_zero(d); + c *= invunrolls[0, i + tz++]; + } + return c; +} +// costs is an array of length two. +// memory costs, unnormalized by `prod(unrolls)` +// `invunrolls` is a matrix, row-0 are the inverse unrolls, row-1 unrolls. +constexpr auto cost(const AbstractMatrix auto &invunrolls, MemCostSummary mcs, + VectorizationFactor vfi) + -> utils::eltype_t { + auto [mc, orth] = mcs; + using T = utils::eltype_t; + T c{cost(invunrolls, orth.indep)}; + if (!(orth.indep & vfi.indexMask)) { + // depends on vectorized index + if (vfi.indexMask & orth.contig) { + c *= mc.contiguous; + } else if (!orth.contig) { + c *= mc.discontiguous; + } else { + // Discontiguous vector load. + // We consider two alternatives: + // 1. gather/scatter (discontiguous) + // 2. contiguous load for each vectorization factor of length equal to + // unroll, followed by shuffles. + // E.g., unroll contig by 4, another dim is vectorized by 8: + // we'd have 8 vloads (max(4/8,1) * 8), followed by 4*log2(8) shuffles. 
+ // w_0 = [0, 8, 16, 24] + // w_1 = [1, 9, 17, 25] + // w_2 = [2, 10, 18, 26] + // w_3 = [3, 11, 19, 27] + // w_4 = [4, 12, 20, 28] + // w_5 = [5, 13, 21, 29] + // w_6 = [6, 14, 22, 30] + // w_7 = [7, 15, 23, 31] + // + // x_0 = [0, 8, 16, 24, 4, 12, 20, 28] + // x_1 = [1, 9, 17, 25, 5, 13, 21, 29] + // x_2 = [2, 10, 18, 26, 6, 14, 22, 30] + // x_3 = [3, 11, 19, 27, 7, 15, 23, 31] + // + // y_0 = [0, 1, 16, 17, 4, 5, 20, 21] + // y_1 = [8, 9, 24, 25, 12, 13, 28, 29] + // y_2 = [2, 3, 18, 19, 6, 7, 22, 23] + // y_3 = [10, 11, 26, 27, 14, 15, 30, 31] + // + // z_0 = [0, 1, 2, 3, 4, 5, 6, 7] + // z_1 = [8, 9, 10, 11, 12, 13, 14, 15] + // z_2 = [16, 17, 18, 19, 20, 21, 22, 23] + // z_3 = [24, 25, 26, 27, 28, 29, 30, 31] + // + // Or, if we unroll contig by 8, and another dim is vectorzeed by 2, we'd + // have 8 = (max(8/2,1) * 2) vloads, 8*log2(2) + // shuffles. + // w_0_0 = [0, 2] + // w_0_1 = [4, 6] + // w_0_2 = [8, 10] + // w_0_3 = [12, 14] + // w_1_0 = [1, 3] + // w_1_1 = [5, 7] + // w_1_2 = [9, 11] + // w_1_3 = [13, 15] + // + // z_0 = [0, 1] + // z_1 = [2, 3] + // z_2 = [4, 5] + // z_3 = [6, 7] + // z_4 = [8, 9] + // z_5 = [10, 11] + // z_6 = [12, 13] + // z_7 = [14, 15] + // Earlier, I had another term, `4*log2(max(8/4,1)) `8*log2(max(2/8,1))` + // i.e. u*log2(max(v/u,1)) + // but I think we can avoid this by always working with vectors that are + // the larger of `u` and `v`, inserting at the start or extracting at the + // end, whichever is necessary. + // We divide by `u[contig]`, as it is now accounted for + // So we have + // v*max(u/v, 1) + u*log2(v) + T iu{invunrolls[0, orth.contig]}, u{invunrolls[1, orth.contig]}; + utils::invariant(iu == 1 / u); + // FIXME: memory and shuffle cost should be separate? 
+ c *= math::smin(mc.contiguous * math::smax(u, (1 << vfi.l2factor) * iu) + + u * vfi.l2factor, + mc.discontiguous); + } + } else c *= mc.scalar; + return c; +} +/// General fallback method for those without easy to represent structure +/// inds is an `IR::Address->indexMatrix()`, thus it is `arrayDim() x +/// getNumLoops()` +/// Non-standard structure here means that we have at least one loop with +/// more than one array dimension. +/// For these, we use the incorrect formula: +/// +constexpr auto cost(const AbstractMatrix auto &invunrolls, MemCostSummary orth, + VectorizationFactor vfi, DensePtrMatrix inds) + -> utils::eltype_t { + using T = utils::eltype_t; + T c{1}; + auto [arrayDim, numLoops] = shape(inds); + utils::invariant(numLoops > 0); + utils::invariant(arrayDim > 0); + utils::invariant(arrayDim <= 64); + utils::invariant(invunrolls.numCol(), inds.numCol()); + for (ptrdiff_t d = 0; d < arrayDim; ++d) { + int64_t g = 0; + containers::BitSet64 bs; + T uprod; + for (ptrdiff_t l = 0; l < numLoops; ++l) { + if ((uint32_t(1) << l) == vfi.indexMask) continue; + int64_t a = inds[d, l]; + if (!a) continue; + bool docontinue{false}; + // We only + for (ptrdiff_t k = 0; k < arrayDim; ++k) { + if ((k == d) || (!inds[k, l])) continue; + docontinue = (inds[d, _] != inds[k, _]) || (d > k); + if (docontinue) break; + } + if (docontinue) continue; + if (bs.empty()) { + g = a; + uprod = invunrolls[0, l]; + } else { + g = math::gcd(g, a); + uprod *= invunrolls[0, l]; + } + bs.insert(l); + }; + if (bs.size() < 2) continue; + T prod{1}; + for (ptrdiff_t l : bs) { + if ((uint32_t(1) << l) == vfi.indexMask) continue; + int64_t a = inds[d, l]; + if (!a) continue; + prod *= (1 - (a / g) * (uprod / invunrolls[0, l])); + } + c *= (1 - prod); + } + // c is a scaling factor; now we proceed to calculate cost similaly to the + // orth-axis implementation above. + return c * cost(invunrolls, orth, vfi); +} + +// We need to define an unroll ordering. 
+struct RegisterUseByUnroll { +  math::Vector> masks{}; // coef, mask pairs +  unsigned register_count; // includes constant offset +  [[nodiscard]] constexpr auto begin() const +    -> const std::array * { +    return masks.begin(); +  } +  [[nodiscard]] constexpr auto end() const -> const std::array * { +    return masks.end(); +  } +}; +// TODO: define function to implement register_count +constexpr auto registerPressure(const AbstractMatrix auto &invunrolls, +                                const RegisterUseByUnroll &r) +  -> utils::eltype_t { +  utils::eltype_t acc{0}; +  for (auto [c, m] : r) { +    utils::eltype_t t{1}; +    containers::BitSet64 bs{std::array{m}}; +    for (ptrdiff_t i : bs) t *= invunrolls[1, i]; +    acc += c * t; +  } +  // note the softplus(8x)/4, so 2x scaling on penalty representing +  // the stack load+store combination. +  return 0.25 * math::softplus(8.0 * (acc - r.register_count)); +} + +inline auto registerUse(const llvm::TargetTransformInfo &TTI, +                        LoopDepSatisfaction deps, IR::Loop *L) +  -> RegisterUseByUnroll { +  RegisterUseByUnroll u; +  // Ideally, we'd have the transitive closure of dependencies, or better yet +  // top-sorted IDs for quick checks on relative order w/ respect to the current +  // top-sorting. +  // E.g. ID_x < ID_y proves it is legal for ID_x to be first, but does not +  // prove the opposite is illegal. The weak proof may often be enough. "better +  // yet" is because the check is very efficient, not because it is powerful. +  // For a somewhat-efficient check of the former variety, we'd probably want +  // to use `BitSet`s + canonical ID-values (not position-based) for each +  // `Value`, which is something we could do if switching to a more +  // data-oriented design. +  // The simple top-index check is enough for checking if something is in front, +  // behind, or within a loop. +  // +  // We scan dependencies, looking for reduction latencies +  // can we use `TTI.getRegUsageForType()`?
+ // TargetTransformInfoImplBase defaults to `1`, and some backends like x86 + // do not override it, so it is not something we can rely on. + // + // d.input()->reductionLatency() > 0 indicates a dependence is live across the + // loop otherwise, we only consider instructions within the loop? + // ...or...perhaps we only need to consider the loop leaf's instructions; we + // can see what is written and referenced. + // For now, will try and follow the latter approach. + for (IR::Value *v : L->nodes()) { + // use `v` + } + return u; +} + +inline auto +memcosts(const AbstractMatrix auto &invunrolls, VectorizationFactor vf, + math::PtrVector> orth_axes) { + utils::eltype_t ic{}; + for (auto [oa, mc] : orth_axes) ic += cost(invunrolls, oa, mc, vf); + return ic; +} +inline auto +memcosts(const AbstractMatrix auto &invunrolls, VectorizationFactor vf, + math::PtrVector>> + orth_axes) { + utils::eltype_t ic{}; + for (auto [oa, inds] : orth_axes) ic += cost(invunrolls, oa, vf, inds); + return ic; +} +inline auto compcosts(const AbstractMatrix auto &invunrolls, + math::PtrVector> compindep) { + utils::eltype_t cc{}; + for (auto [oa, sf] : compindep) cc += cost(invunrolls, oa) * sf; + return cc; +} + +// We then additionally need a throughput vs latency estimator, and code for +// handling the tail. +// Standard throughput is fairly trivial/should be a vector sum, +// although we may have some operations not dependent on all loops, +// in which case unrolling the loops they don't depend on will help. +// Thus, it would probably be best to handle these with code +// similar to the memory cost-fun above, ideally we can abstract away the core. 
+// +/// memcost = I*J*(Ui*Uj*C_{Al} + Uj*C_{yl}) / (Ui*Uj) + +/// I*(C_{xl}*Ui + C_{xs}*Ui) / Ui +/// cthroughput = I*J*(Ui*Uj*C_{t,fma}) / (Ui*Uj) + I*(Ui*C_{t,add}*(Uj-1)) / Ui +/// Ui clatency = I*J*C_{l,fma}/smin(Ui*Uj, C_{l,fma}/C_{t,fma}) + +/// I*C_{l,add}*log2(Uj) +/// +/// Here, we define a cost fn that can be optimized to produce +/// +/// vectorization and unrolling factors. +/// We assemble all addrs into a vector, sorted by depth first traversal order +/// of the loop tree, e.g. +/// A(0) --> B(1) --> C(2) --> D(3) +/// \-> E(5) --> F(6) \-> G(4) +/// \-> H(7) --> I(8) --> J(9) +/// Focusing only on memory addresses initially... +/// The cost of a particular read/write can be looked up from LLVM +/// as a function of scalar/gather/scatter/broadcast/contiguous. +/// Then this can be adjusted by the product of all unroll factors of loops +/// it depends on, divided by the product of all unroll factors of all +/// containing loops. +/// To optimize, we can branch and bound. Unrolling factors lead to a natural +/// relaxation that plays well, but less so for binary variables like which +/// loop is vectorized. Additionally, patterns such as replacing +/// gather/scatters with shuffle sequences need special handling, that +/// restricts the branch and bound to powers of 2. To be able to build such a +/// cost model, we need to estimate the number of live variables as a result +/// of unroll factors, in order to impose constraints. +/// +/// We use soft constraints for register pressuring, representing the +/// store/reload pair of a spill. +/// +/// Furthermore, we also need to consider the possibility of dependency +/// chains. Consider, for example +/// ``` +/// for (ptrdiff_t i = 0; i < I; ++i){ +/// eltype_t xi = x[i]; +/// for (ptrdiff_t j = 0; j < J; ++j) +/// xi += A[i][j] * y[j]; +/// x[i] = xi; +/// } +/// ``` +/// The `j` loop itself has a dependency chain. +/// Two options for addressing this: +/// 1. 
unrolling `j`, cloning the accumulation registers, and reducing at the +/// end. +/// 2. unrolling the `i` loop. +/// The second option is better, but may not be possible, e.g. if there is no +/// `i` loop or it carries some dependency. Thus, we want our model to unroll +/// `i` when legal, and unroll `j` otherwise. +/// Assuming a throughput of 2 fma/cycle and a latency of 4 cycles, an +/// estimate of the cost as a function of I, J, Ui, and Uj is (ignoring +/// vectorization): 4*I*J/min(Ui*Uj, 2*4) + 4*I*log2(Uj) The first term is +/// latency per fma (because of the dependency chain) * the number of +/// iterations, divided by however many unrolling allows us to have inflight. +/// The second term is for the reduction of the cloned `Uj` accumulators. Each +/// step in the reduction has a latency of 4 cycles, and we need to do +/// `log2(Uj)` steps. +/// +/// Note, `y-softplus(l*(y-x))/l` is a good smooth minimum function, +/// monotonic in `x` and differentiable everywhere. `l` controls +/// sharpness. Likewise, `y+softplus(l*(x-y))/l` for `max`. +/// +/// Thus, a cost function for the above gemv could be something like +/// memcost = I*J*(Ui*Uj*C_{Al} + Uj*C_{yl}) / (Ui*Uj) + +/// I*(C_{xl}*Ui + C_{xs}*Ui) / Ui +/// cthroughput = I*J*(Ui*Uj*C_{t,fma}) / (Ui*Uj) + I*(C_{t,add}*(Uj-1)) / +/// Ui clatency = I*J*C_{l,fma}/smin(Ui*Uj, C_{l,fma}/C_{t,fma}) + +/// I*C_{l,add}*log2(Uj) +/// cost = memcost + smax(cthroughput, clatency) +/// or, if it is easier to solve: +/// cost = memcost + cthroughput + clatency +/// +/// We may initially want to add a small cost for loop increment and +/// cmp/branch, to encourage unrolling more generally, plus a cost for +/// unrolling to discourage any excess unrolling when it doesn't provide +/// meaningful benefits (representing the general cost of code size/ filling +/// uop cache -- we definitely want loops to fit in the uop cache of any CPU +/// sporting one!!! ).
+/// +/// +/// +/// Note that if we had +/// ``` +/// for (ptrdiff_t i = 0; i < I; ++i){ +/// eltype_t yi = y[i]; +/// for (ptrdiff_t j = 0; j < J; ++j) +/// x[j] += A[i][j] * yi; +/// } +/// ``` +/// then unrolling the `i` loop doesn't increase OOO (Out Of Order execution), +/// but we can assume that as successive `j` iterations are independent/do not +/// have a dependency chain, this isn't an issue. That is, we only consider +/// reductions across the inner-most loop as requiring cloning of accumulators. +/// +/// On throughput modeling, LLVM seems to generally give a recip throughput of +/// 1 for pipelined instructions, regardless of number of ports. This is +/// actually what we want, as this allows RTs to be additive (e.g., we may +/// have a fma that is able to run on 2 ports (e.g. p0 or p5) and a permute +/// that can only execute on one (e.g. p5); when mixing these instructions, +/// they have the same effective cost -- they use a port -- and the more +/// limited port choices of one isn't a problem so long as others can use what +/// remains. For our purposes, it isn't worth getting too fancy here. It is +/// worth noting that the baseline model presented here +/// https://arxiv.org/pdf/2107.14210.pdf +/// performed respectively well when compared to vastly more sophisticated +/// tools; for example, it performed similarly well as llvm-mca on most tested +/// architectures! +/// The baseline model used above for loops was +/// max(1, (n-1)/i, m_r/m, m_w/w) +/// where +/// n - the number of instructions in the benchmark (-1 because of assumption +/// that the cmp and branch are macro-fused, meaning the last two instructions +/// count as 1) +/// m_r - number of memory reads +/// m_w - number of memory writes +/// i - the issue width, e.g. 4 for Intel Skylake CPUs. +/// m - number of reads the CPU can do per cycle (2 for all in the article) +/// w - number of writes the CPU can do per cycle (e.g. 
2 for Ice Lake and +/// newer, 1 for older) Unfortunately, we cannot get the CPU-specific +/// information (`i`,`m`,or`w`) from LLVM. However, these are largely a matter +/// of scale, and are generally correlated. E.g., Intel's Alderlake's values +/// would be 6, 3, and 2, vs the older Skylake's 4, 2, and 1. While not all +/// the ratios are equal (`w`'s is 2 instead of 1.5), it is unlikely that many +/// optimization decisions are going to be made differently between them. +/// A possible exception is that we may wish to unroll more for CPUs with more +/// out of order execution abilities. `getMaxInterleaveFactor` is an indicator +/// of whether the pipeline might be very narrow. +/// +/// +/// Given `x[a*i + b*j]`, where neither `i` or `j` are vectorized (and `a` and +/// `b` are compile time constants), we use: +/// (a_g*U_i + b_g*U_j - a_g*b_g) / (U_i*U_j) +/// = a_g/U_j + b_g/U_i - a_g*b_g / (U_i*U_j) +/// = 1 - (1 - a_g/U_j ) * (1 - b_g/U_i) +/// as the cost, where `a_g = abs(a/gcd(a,b))` and `b_g = abs(b/gcd(a,b))`. +/// +/// For more, we generalize this pattern +/// = 1 - \prod_{d}^{D}\left(1 - \frac{coef_{g,d}U_d}{\prod_{i}^{D}U_i}\right) +/// +/// In the `D=3` case, this expands to +/// 1 - (1 - a_g/(U_j*U_k))(1 - b_g/(U_i*U_k))(1 - c_g/(U_i*U_j)) +/// = 1 - (1 - c_g/(U_i*U_j))* +/// (1 - a_g/(U_j*U_k) - b_g/(U_i*U_k)) + a_g*b_g/(U_i*U_j*U_k^2)) +/// = a_g/(U_j*U_k) + b_g/(U_i*U_k)) + c_g/(U_i*U_j) - a_g*b_g/(U_i*U_j*U_k^2)) +/// - a_g*c_g/(U_i*U_j^2*U_k) - b_g*c_g/(U_i^2*U_j*U_k)) +/// + a_g*b_g*c_g/(U_i^2*U_j^2*U_k^2)) +/// +/// TODO: check the degree of correctness... +/// I kind of just made something up that looks sort of right. +/// +/// For register consumption, we +/// 1. Determine an ordering of unroll factors for each inner most loop. +/// 2. Define a registers used as a function of these unroll factors. 
+/// +/// Loads from inner unrolls that don't depend on any outer-unrolls must have +/// lifetimes spanning all outer-unrolls, if they're re-used by an op +/// depending on that outer. Our heuristic for ordering unrolls is based on +/// the twin observations: +/// 1. Inner unrolls are likely to consume more registers for longer. +/// 2. More ops with overlapping lifetimes dependent on one particular loop +/// require more registers. +/// +/// As the ordering of unrolls influences register pressure, we sort them +/// first by register cost per unroll (placing those with the highest register +/// cost outside), and then by memory op cost within these categories, placing +/// the highest costs innermost (higher memory cost means lower unroll +/// relative to the lower cost, so that we get more reuse on the higher cost +/// operations; lower unroll means we place inside, reducing the cost of these +/// unrolls). +/// +/// So, how do we define register cost per unroll in an unroll-order +/// independent manner, so that we can use this for determining the order? +/// ``` +/// for (int m=0; m*--> (Cmn +=) +/// B[k,n] -/ +/// +/// Register Costs: +/// Amk_rc = U_m * U_k // live until use +/// Bkn_rc = U_k * U_n // live until use +/// Cmn_rc = U_m * U_n // live until end of loop +/// Memory Op Costs, m-vectorized (assuming column-major): +/// Amk_rc = L_c * U_m * U_k +/// Bkn_rc = L_b * U_k * U_n +/// Cmn_rc = 0 * U_m * U_n +/// L_c > L_b, so A-contiguous load should be interior to B-broadcast load. +/// +/// As the cost function is evaluated many times, we try and move as much work +/// to the setup as possible. Loop cost is thus divided into some structured +/// components, and much of the interpreting work hoisted to a step defining a +/// parameterization. +/// Ideally, we would avoid repeating this work for different vectorization +/// decisions. However, vectorization decisions may impact unroll ordering +/// decisions. 
+/// +/// +/// +/// /// +class LoopTreeCostFn { + // counts per loop, indicating how many of each of the following three fields + math::Vector cost_counts{}; + // orthogonal axes and costs + math::Vector orth_axes{}; + // non-orthogonal axes and costs + math::Vector>> conv_axes{}; + // compute cost summary + math::Vector> compute_independence{}; + // for leaves, we need latency information + llvm::SmallVector>> + leafs{}; + unsigned maxVectorWidth; + ptrdiff_t max_depth{}; + + constexpr void clear() { + cost_counts.clear(); + orth_axes.clear(); + conv_axes.clear(); + compute_independence.clear(); + leafs.clear(); + max_depth = 0; + } + + // should only have to `init` once per `root`, with `VectorizationFactor` + // being adjustable. + // Note: we are dependent upon scanning in top order, so that operands' + // `calcLoopDepFlag()` are calculated before we get. + // TODO: vec factor should be a tree-flag + // Iteration order: + // We fully iterate over a loop before descending + // for (i : I){ + // // block 0 + // for (j : J){ + // // block 1 + // } + // // block 2 + // for (j : J){ + // // block 3 + // } + // // block 4 + // } + // we'd iterate 0, 2, 4, 1, 3. + // This way we can store once we hit the end. + // If there are no subloops to iterate to after, then we store the exit count. + // If there are, then the exit-count is 0, forward '1+exit' count to the last + // sub-loop, and `1` to all previous sub-loops. + // It's thus natural to implement recursively. 
+ // NOLINTNEXTLINE(misc-no-recursion) + void initLoop(LoopDepSatisfaction deps, IR::Loop *L, unsigned maxl2VF, + const llvm::TargetTransformInfo &TTI, ptrdiff_t depth, + unsigned exitCount) { + invariant(depth > 0); + ptrdiff_t compute = compute_independence.size(), omemory = orth_axes.size(), + cmemory = conv_axes.size(); + unsigned maxVF = 1 << maxl2VF; + // Loop and push throughput costs + for (IR::Node *N = L->getChild(); N; N = N->getNext()) { + if (auto *A = llvm::dyn_cast(N)) { + OrthogonalAxes oa = A->calcOrthAxes(depth); + IR::Addr::Costs rtl = A->calcCostContigDiscontig(TTI, maxVF); + if (oa.indep_axes) { + // check for duplicate + bool found = false; + for (ptrdiff_t i = omemory; i < orth_axes.size(); ++i) { + if (orth_axes[i].orth != oa) continue; + found = true; + orth_axes[i].memcost += rtl; + break; + } + if (!found) orth_axes.emplace_back(rtl, oa); + } else { + bool found = false; + for (ptrdiff_t i = cmemory; i < conv_axes.size(); ++i) { + if (conv_axes[i].first.orth != oa) continue; + if (conv_axes[i].second != A->indexMatrix()) continue; + found = true; + conv_axes[i].first.memcost += rtl; + break; + } + if (!found) + conv_axes.emplace_back(MemCostSummary{rtl, oa}, A->indexMatrix()); + } + } else if (auto *C = llvm::dyn_cast(N)) { + bool found = false; + uint32_t indep = C->calcLoopDepFlag(depth); + float cc{float( + C->getCost(TTI, IR::VectorWidth{maxVF, maxl2VF}).recipThroughput)}; + for (ptrdiff_t i = compute; i < compute_independence.size(); ++i) { + if (compute_independence[i].second != indep) continue; + found = true; + compute_independence[i].first += cc; + break; + } + if (!found) compute_independence.emplace_back(cc, indep); + } // else if (auto *S = llvm::dyn_cast(N)) { + } + auto [known_trip, trip_count] = L->getAffineLoop()->tripCount(depth); + uint16_t compcnt = compute_independence.size() - compute, + omemcnt = orth_axes.size() - omemory, + cmemcnt = conv_axes.size() - cmemory; + IR::Loop *SL = L->getSubLoop(); + 
cost_counts.emplace_back(known_trip, trip_count, compcnt, omemcnt, cmemcnt, +                             SL ? 0 : exitCount); +    if (SL) iterLoopLevel(deps, SL, maxl2VF, TTI, ++depth, exitCount); +    else leafCosts(deps, L, maxl2VF, TTI); +  } +  void leafCosts(LoopDepSatisfaction deps, IR::Loop *L, unsigned maxl2VF, +                 const llvm::TargetTransformInfo &TTI) { +    // TODO: if (!SL) we're in a leaf, and need compute latency +    // We use the `IROptimizer::loopDepSats` to check the dependencies held at +    // the loop. We check these for those that look like reductions that are +    // legal to reassociate (we check this earlier and set +    // `in->reassociableReductionPair()==out`), e.g. integer add chains or +    // floating point with the reassociate FMF set. +    // When we have reductions, we have src->dst chains stored through +    // `linkReductionDst()` that can be used for accumulating latencies. +    // FIXME (maybe): Current implementation only allows each instruction to be +    // a part of 1 chain. +    // for (j in J){                  // arbitrary number of outer +    // loops +    //   %w = %array[j...]; +    //   %x = foo(%w); +    //   for (i in I){ // inner loop(s) +    //     %y = bar(%x); +    //   } +    //   %z = quz(%y); +    //   %array[j...] = %z; +    // } +    // Rather than using PhiNodes, we represent dependencies through addresses. +    // we can get legality from the loop. +    // The trickier thing to compute here is register pressure +    llvm::InstructionCost::CostType latency{0}; +    for (poly::Dependence d : deps.depencencies(L)) { +      // instruction latency can be a function of vector width +      latency = +        std::max(latency, d.input()->reductionLatency(TTI, maxVectorWidth)); +    } +    CostModeling::Legality legality = L->getLegality(); +    uint16_t l = std::numeric_limits::max(); +    if (l > latency) l = latency; +    // for reg use, let's add register dep flag +    // what kind of traversal would minimize width? +    // breadth-first lets us retire early, but can increase +    // live count? +    // Note, every reduction must add register contribution.
+ leafs.emplace_back(registerUse(TTI, deps, L), + Pair{l, legality.numReductions()}); + // for (IR::Node *N = L->getChild(); N; N = N->getNext()) {} + return; + }; + + // NOLINTNEXTLINE(misc-no-recursion) + void iterLoopLevel(LoopDepSatisfaction deps, IR::Loop *L, unsigned maxl2VF, + const llvm::TargetTransformInfo &TTI, ptrdiff_t depth, + unsigned exitCount) { + do { + IR::Loop *N = L->getNextLoop(); + unsigned ec = N ? ++exitCount : 1; + initLoop(deps, N, maxl2VF, TTI, depth, ec); + L = N; + } while (L); + } + +public: + // this is a vector fun, where indexing may do non-trivial computation + // also, mapping from this vector to loop position isn't trivial either + // hence, we use a 2 x max_depth matrix that we copy into as we descend + // (and pop from as we ascend). Row `0` is for inverse values, + // and row `1` for direct values. + // Inverses are favored as our costs fns use them more often. + constexpr auto operator()(alloc::Arena<> alloc, + const AbstractVector auto &x) const { + using T = utils::eltype_t; + utils::invariant(max_depth < 16); + // row 0: inverse unrolls + // row 1: unrolls + // row 2: cumprod invunroll + math::MutArray> invunrolls{ + math::matrix(alloc, math::Row<3>{}, math::Col<>{max_depth})}; + ptrdiff_t i = 0, depth = 0, mi = 0, mc = 0, ci = 0, li = 0; + double tripcounts[16]; + VectorizationFactor vf{}; + // we evaluate every iteration + T c{}; + for (auto [comptimetrip, trip_count, compute, omem, cmem, exit, l2vw] : + cost_counts) { + if (l2vw) { + invariant(vf.l2factor == 0); + invariant(vf.indexMask == 0); + vf.l2factor = l2vw; + vf.indexMask = uint32_t(1) << depth; + } + invunrolls[1, depth] = x[i++]; + invunrolls[2, depth] = invunrolls[0, depth] = 1 / invunrolls[1, depth]; + if (depth) invunrolls[2, depth] *= invunrolls[2, depth - 1]; + tripcounts[depth] = + (depth ? 
tripcounts[depth - 1] * trip_count : trip_count); + T cc{compcosts(invunrolls, compute_independence[_(0, compute) + ci])}; + ci += compute; + if (exit) { + auto [reguse, lt] = leafs[li++]; + auto [l, numreduct] = lt; + // we're now in a leaf, meaning we must consider register costs, + // as well as reduction costs and latency of reduction chains. + cc = smax(cc, l * invunrolls[2, depth]); + cc += registerPressure(invunrolls, reguse); + if (numreduct) { + cc += + compcost(invunrolls, compute_independence[_(0, numreduct) + ci]) * + log2(invunrolls[1, depth]) / trip_count; + ci += numreduct; + } + } + cc += memcosts(invunrolls, vf, orth_axes[_(0, omem) + mi]); + mi += omem; + cc += memcosts(invunrolls, vf, conv_axes[_(0, cmem) + mc]); + mc += cmem; + c += tripcounts[depth] * cc; + // Decrement depth by `exit - 1`; the `-1` corresponds + // to descending into this header, while we exit `exit` loops afterwards. + depth -= exit - 1; // don't fuse `-1` to keep `exit` unsigned + if (depth <= std::countr_zero(vf.indexMask)) { + vf.l2factor = 0; + vf.indexMask = 0; + } + } + return c; + } + void init(LoopDepSatisfaction deps, IR::Loop *root, unsigned maxl2VF, + llvm::LLVMContext &C, const llvm::TargetTransformInfo &TTI) { + clear(); // max_depth = 0; + maxVectorWidth = RegisterFile::estimateMaximumVectorWidth(C, TTI); + iterLoopLevel(deps, root->getSubLoop(), maxl2VF, TTI, 0, 0); + } + LoopTreeCostFn(LoopDepSatisfaction deps, IR::Loop *root, unsigned maxVF, + llvm::LLVMContext &C, const llvm::TargetTransformInfo &TTI) + : maxVectorWidth{unsigned(1) << maxVF} { + init(deps, root, maxVF, C, TTI); + } +}; + +} // namespace poly::CostModeling diff --git a/include/Optimize/CostModeling.hpp b/include/Optimize/CostModeling.hpp new file mode 100644 index 000000000..a8176ea4c --- /dev/null +++ b/include/Optimize/CostModeling.hpp @@ -0,0 +1,948 @@ +#pragma once + +#include "Dicts/BumpMapSet.hpp" +#include "Graphs/Graphs.hpp" +#include "IR/Address.hpp" +#include 
"LinearProgramming/LoopBlock.hpp" +#include "LinearProgramming/ScheduledNode.hpp" +#include "Optimize/Legality.hpp" +#include "Polyhedra/Dependence.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace poly { +namespace IR { +/// If this is a store of a reassociable reduction, this sets the +/// `reassociableReduction` field to the corresponding load, and that field of +/// the load to `this` store. +/// It requires `Addr` to have been sorted, so we check the first output edge of +/// this store. If that edge is a load within the same loop, and has a time +/// dependence, we check for a reassociable chain of compute operations +/// connecting them. If such a chain, without any non-reassociable chains, +/// exists, then we mark them as reassociable. +/// Note, with sorting +/// for (int i = 0; i < I; ++i) +///   for (int j = 0; j < J; ++j) +///     x[i] = x[i] + A[j,i] * y[j]; +/// x[i] = acc; +/// +/// we have the store `x[i]` is the source for the `x[i]` load on a future +/// `j` iteration. +/// However, our IR would be optimized into: +/// +/// for (int i = 0; i < I; ++i){ +///   acc = x[i]; +///   for (int j = 0; j < J; ++j) +///     acc += A[j,i] * y[j]; +///   x[i] = acc; +/// } +/// +/// The same thing applies: `j` is the loop that satisfies the dependency, +/// but we hoisted the load/store pair out. +/// This must be called after `sortEdges`, so that output edges of the store +/// `x[i] = acc` are top sorted. The load `acc = x[i]` should be the very +/// first output topologically -- after all, it occurs before the store!! +/// TODO: does `Addr` hoisting handle this??
+/// Consider also the example: +/// int64_t x[1]{}; +/// for (ptrdiff_t n = 0; n < N; ++n){ +/// x[0] = x[0] + y[n]; +/// z[n] = x[0]; +/// } +/// this is harder to understand than, but behaves the same as +/// z[0] = y[n]; +/// for (ptrdiff_t n = 1; n < N; ++n){ +/// z[n] = z[n-1] + y[n]; +/// } +/// int64_t x[1]{z[N-1]}; +/// which does not have any reductions. +/// This should be handled because, if we had a loop like +/// int64_t x[1]{}; +/// for (ptrdiff_t n = 0; n < N; ++n) x[0] = x[0] + y[n]; +/// it should be optimized into +/// int64_t x[1]{}; +/// auto xv = x[0]; +/// for (ptrdiff_t n = 0; n < N; ++n) xv = xv + y[n]; +/// x[0] = xv; +/// However, the assignment `z[n]` should block the hoisting of the load/store +/// and we can check that failure to hoist for verifying legality. +constexpr inline void +Addr::maybeReassociableReduction(const Dependencies &deps) { + // we only run for `this->isStore() && dst->isLoad()` + if (isLoad()) return; + // we should have a store whose first output edge is the load for + // the following iteration. This iter is the reverse-time edge. + auto edges{outputEdgeIDs(deps, getCurrentDepth())}; + auto B = edges.begin(); + if (B == edges.end()) return; + poly::Dependence::ID id{*B}; + if (deps.revTimeEdge(id) < 0) return; + IR::Addr *dst = deps.output(id); + if (dst->isStore() || (getLoop() != dst->getLoop())) return; + // if we failed to hoist the `Addr` out of time-dims, then we cannot optimize. 
+ if (getCurrentDepth() > deps.satLevel(id)) return; + if (reassociableReduction == dst) return; // multiple time dims, already found + auto *c = llvm::dyn_cast(getStoredVal()); + if (!c) return; + if (findThroughReassociable(dst, c) != 1) return; + reassociableReduction = dst; + dst->reassociableReduction = this; +} + +} // namespace IR +namespace CostModeling { +using poly::Dependence; +// struct CPUExecutionModel {}; + +template using Vec = math::ResizeableView; + +// TODO: instead of this, update in-place and ensure all Addr are +// over-allocated to correspond with max depth? Because we parse in reverse +// order, we have max possible depth of `ScheduledNode`s using it at time we +// create. + +/// LoopTree +/// A tree of loops, with an indexable vector of IR::Loop*s, to facilitate +/// construction of the IR::Loop graph, from the fusion omegas +class LoopTree { + // The root of this subtree + Valid loop; + // LoopTree *parent{nullptr}; // do we need this? + Vec children{}; + unsigned depth{0}; + // We do not need to know the previous loop, as dependencies between + // the `Addr`s and instructions will determine the ordering. + constexpr LoopTree(Arena<> *lalloc, poly::Loop *L, LoopTree *parent_) + : loop{lalloc->create(parent_->depth + 1, L)}, + depth(parent_->depth + 1) { + // allocate the root node, and connect it to parent's node, as well as + // previous loop of the same level. + loop->setParent(parent_->loop); + } + constexpr LoopTree(Arena<> *lalloc) : loop{lalloc->create(0)} {} + +public: + static auto root(Arena<> *salloc, Arena<> *lalloc) -> LoopTree * { + return new (salloc) LoopTree(lalloc); + } + // salloc: Short lived allocator, for the indexable `Vec`s + // Longer lived allocator, for the IR::Loop nodes + // NOLINTNEXTLINE(misc-no-recursion) + void addNode(Arena<> *salloc, Arena<> *lalloc, lp::ScheduledNode *node) { + if (node->getNumLoops() == depth) { + // Then it belongs here, and we add loop's dependencies. 
+ // We only need to add deps to support SCC/top sort now. + // We also apply the rotation here. + // For dependencies in SCC iteration, only indvar deps get iterated. + auto [Pinv, denom] = math::NormalForm::scaledInv(node->getPhi()); + Valid explicitLoop = + node->getLoopNest()->rotate(lalloc, Pinv, node->getOffset()); + for (IR::Addr *m : node->localAddr()) { + m->rotate(explicitLoop, Pinv, denom, node->getOffsetOmega(), + node->getOffset()); + loop->insertAfter(m); + } + return; + } + // we need to find the sub-loop tree to which we add `node` + ptrdiff_t idx = node->getFusionOmega(depth); + invariant(idx >= 0); + ptrdiff_t numChildren = children.size(); + if (idx >= children.size()) { + if (idx >= children.getCapacity()) { + // allocate extra capacity + children.reserve(salloc, 2 * (idx + 1)); + } + // allocate new nodes and resize + children.resize(idx + 1); + for (ptrdiff_t i = numChildren; i < idx + 1; ++i) + children[i] = new (salloc) LoopTree{lalloc, node->getLoopNest(), this}; + numChildren = idx + 1; + } + children[idx]->addNode(salloc, lalloc, node); + } + constexpr auto getChildren() -> Vec { return children; } + constexpr auto getLoop() -> IR::Loop * { return loop; } +}; + +inline void hoist(IR::Node *N, IR::Loop *P, int depth) { + N->setParent(P); + N->setCurrentDepth(depth); +} + +struct LoopDepSummary { + IR::Node *afterExit{nullptr}; + IR::Addr *indexedByLoop{nullptr}; + IR::Addr *notIndexedByLoop{nullptr}; +}; +struct LoopIndependent { + LoopDepSummary summary; + bool independent; + constexpr auto operator*=(LoopIndependent other) -> LoopIndependent & { + summary = other.summary; + independent = independent && other.independent; + return *this; + } +}; +/// inline auto searchLoopIndependentUsers(IR::Dependencies deps, IR::Loop *L, +/// IR::Node *N, uint8_t depth, +/// LoopDepSummary summary) +/// +/// Searches `N` and it's users for loop-independent users, and returns them +/// as a list to process. 
+/// This exits early if it finds a dependent user, meaning it will only return +/// a partial list in this case. We search the entire graph eventually, meaning +/// the remainder will be processed later. +/// We return a `LoopDepSummary, bool` pair, where the `bool` is true if `N` +/// was +/// loop independent. We use the `bool` rather than a `nullptr` or optional so +/// that we can still return those results we did find on failure. +/// NOLINTNEXTLINE(misc-no-recursion) +inline auto searchLoopIndependentUsers(const IR::Dependencies &deps, + IR::Loop *L, IR::Node *N, int depth, + LoopDepSummary summary) + -> LoopIndependent { + if (N->dependsOnParentLoop()) return {summary, false}; + if (llvm::isa(N)) return {summary, false}; + if (IR::Loop *P = N->getLoop(); P != L) + return {summary, !(P && L->contains(P))}; + LoopIndependent ret{summary, true}; + auto *a = llvm::dyn_cast(N); + if (a) { + a->removeFromList(); + if (a->indexedByInnermostLoop()) { + a->insertAfter(ret.summary.indexedByLoop); + ret.summary.indexedByLoop = a; + return {summary, false}; + } + a->insertAfter(ret.summary.notIndexedByLoop); + ret.summary.notIndexedByLoop = a; + for (IR::Addr *m : a->unhoistableOutputs(deps, depth - 1)) { + ret *= searchLoopIndependentUsers(deps, L, m, depth, summary); + if (ret.independent) continue; + a->setDependsOnParentLoop(); + return ret; + } + } + // if it isn't a Loop or Addr, must be an `Instruction` + IR::Value *I = llvm::cast(N); + for (IR::Node *U : I->getUsers()) { + ret *= searchLoopIndependentUsers(deps, L, U, depth, summary); + if (ret.independent) continue; + I->setDependsOnParentLoop(); + return ret; + } + // then we can push it to the front of the list, meaning it is hoisted out + if (a && (ret.summary.notIndexedByLoop == a)) + ret.summary.notIndexedByLoop = llvm::cast_or_null(a->getNext()); + I->removeFromList(); + I->insertAfter(ret.summary.afterExit); + ret.summary.afterExit = I; + I->visit(depth); + return ret; +} +/// `R`: remove from loop, if 
not `nullptr`, set the parent of `N` to `R` +/// `R` is applied recursively, forwarded to all calls. +// NOLINTNEXTLINE(misc-no-recursion) +inline auto visitLoopDependent(const IR::Dependencies &deps, IR::Loop *L, +                               IR::Node *N, int depth, IR::Node *body, +                               IR::Loop *R = nullptr) -> IR::Node * { +  invariant(N->getVisitDepth() != 254); +  // N may have been visited as a dependent of an inner loop, which is why +  // `visited` accepts a depth argument +  if (N->wasVisited(depth) || !(L->contains(N))) return body; +#ifndef NDEBUG +  // Our goal here is to check for cycles in debug mode. +  // Each level of our graph is acyclic, meaning that there are no cycles at +  // that level when traversing only edges active at that given level. +  // However, when considering edges active at level `I`, we may have cycles +  // at level `J` if `J>I`. In other words, here we are traversing all edges +  // active at `I=depth`. Within subloops, which necessarily have depth +  // `J>I`, we may have cycles. +  // +  // Thus, we need to prevent getting stuck in a cycle for these deeper loops +  // by setting `N->visit(depth)` here, so `wasVisited` will allow them to +  // immediately return. But, in debug mode, we'll set nodes of the same depth +  // to `254` to check for cycles. +  if (N->getLoop() == L) N->visit(254); +  else N->visit(depth); +#else +  N->visit(depth); +#endif +  // iterate over users +  if (auto *A = llvm::dyn_cast(N)) { +    // Note that `topologicalSort` calls `searchLoopIndependentUsers` which +    // checks whether an `Addr` is `indexedByInnermostLoop`. +    // +    // Note that here `depth` is `0` for top-level, 1 for the outer most loop, +    // etc. That is, loops are effectively 1-indexed here, while `satLevel` +    // is effectively 0-indexed by loop. +    // Example 1: +    // for (ptrdiff_t m = 0; m < M; ++m) +    //   for (ptrdiff_t n = 0; n < N; ++n) +    //     for (ptrdiff_t k = 0; k < K; ++k) C[m,n] = C[m,n] + A[m,k]*B[k,n]; +    // we have cyclic dependencies between the load from/store to `C[m,n]`.
+ // The `C[m,n]` load -> `C[m,n]` store was not satisfied by any loop, so + // the sat level is 255. + // The `C[m,n]` store -> `C[m,n]` load has satLevel = 2. + // Example 2: + // for (ptrdiff_t m = 0; m < M; ++m) + // for (ptrdiff_t n = 1; n < N; ++n) C[m,n] = C[m,n] + C[m,n-1]; + // we again have a cyple, from the load `C[m,n-1]` to the store `C[m,n]`, + // and from the store `C[m,n]` to the load `C[m,n-1]` on the following + // iteration. + // The former has a sat level of 255, while the latter has a sat level of + // `1`. + // + // isActive(depth) == satLevel() > depth + // + // a. load->store is not satisfied by any loop, instead handled by sorting + // of instructions in the innermost loop, i.e. sat is depth=3. + // b. store->load is carried by the `k` loop, i.e. sat is depth=2. + // Because `2 > (3-1) == false`, we do not add it here, + // its sorting isn't positional! + // + // TODO: + // - [ ] I think the current algorithm may illegally hoist certain + // dependencies carried on this loop. Specifically, we can hoist + // addresses that (a) are not indexed by this loop, but need to be + // repeated anyway because of some other address operation, while that + // combination can't be moved to registers, e.g. because their index + // matrices are not equal. + // We need to distinguish between order within the loop, for the + // purpose of this topsort, and placement with respect to the loop. + // Simply, we perhaps should simply avoid hoisting when we carry + // a dependence that doesn't meet the criteria of `unhoistableOutputs` + // - [ ] Incorporate the legality setting here? 
+ for (IR::Addr *m : A->unhoistableOutputs(deps, depth - 1)) { + if (m->wasVisited(depth)) continue; + body = visitLoopDependent(deps, L, m, depth, body, R); + } + } + if (auto *I = llvm::dyn_cast(N)) { + for (IR::Node *U : I->getUsers()) { + if (U->wasVisited(depth)) continue; + body = visitLoopDependent(deps, L, U, depth, body, R); + } + } else if (auto *S = llvm::dyn_cast(N)) { + for (IR::Node *U : S->getChild()->nodes()) { + if (U->wasVisited(depth)) continue; + body = visitLoopDependent(deps, L, U, depth, body, R); + } + } +#ifndef NDEBUG + if (N->getLoop() == L) N->visit(depth); +#endif + if (N->getLoop() == L) body = N->setNext(body); + if (R) hoist(N, R, depth - 1); + return body; +} +inline void addBody(const IR::Dependencies &deps, IR::Loop *root, int depth, + IR::Node *nodes) { + IR::Exit exit{}; // use to capture last node + IR::Node *body{&exit}; + for (IR::Node *N : nodes->nodes()) + body = visitLoopDependent(deps, root, N, depth, body); + body = root->setChild(body); // now we can place the loop + IR::Node *last = exit.getPrev(); + if (last) last->setNext(nullptr); + root->setLast(last); +} +inline void topologicalSort(const IR::Dependencies &deps, IR::Loop *root, + int depth) { + // basic plan for the top sort: + // We iterate across all users, once all of node's users have been added, + // we push it to the front of the list. Thus, we get a top-sorted list. + // We're careful about the order, so that this top sort should LICM all the + // addresses that it can. + // + // We must push the exit before the root (as the exit depends on the loop, + // and we iterate users). The exit doesn't use any in this block, so we + // begin by trying to push any instructions that don't depend on the loop. + // If we fail to push them (i.e., because they have uses that do depend on + // the loop), then they get added to a revisit queue. Any instructions we + // are able to push-front before we push the exit, implicitly happen after + // the exit, i.e. 
they have been LICMed into the exit block. We unvisit the + // revisit-queue, and add them back to the main worklist. Then, we proceed + // with a depth-first topological sort normally (iterating over uses, + // pushing to the front), starting with the loop root, so that it gets + // pushed to the front as soon as possible. That is, so that it happens as + // late as possible Any instructions that get pushed to the front afterwards + // have been LICMed into the loop pre-header. + // + // In this first pass, we iterate over all nodes, pushing those + // that can be hoisted after the exit block. + // + IR::Node *C = root->getChild(); + LoopDepSummary summary{}; + for (IR::Node *N : C->nodes()) + summary = searchLoopIndependentUsers(deps, root, N, depth, summary).summary; + // summary.afterExit will be hoisted out; every member has been marked as + // `visited` So, now we search all of root's users, i.e. every addr that + // depends on it + root->setNext(summary.afterExit); + IR::Loop *P = root->getLoop(); + for (IR::Node *N : summary.afterExit->nodes()) hoist(N, P, depth - 1); + addBody(deps, root, depth, summary.indexedByLoop); + IR::Node *body{root}; + for (IR::Node *N : summary.notIndexedByLoop->nodes()) + body = visitLoopDependent(deps, root, N, depth, body, P); +} +// NOLINTNEXTLINE(misc-no-recursion) +inline void buildSubGraph(const IR::Dependencies &deps, IR::Loop *root, + int depth) { + // We build the instruction graph, via traversing the tree, and then + // top sorting as we recurse out + for (IR::Loop *child : root->subLoops()) + buildSubGraph(deps, child, depth + 1); + // The very outer `root` needs to have all instr constituents + // we also need to add the last instruction of each loop as `last` + topologicalSort(deps, root, depth); +} +inline void buildGraph(const IR::Dependencies &deps, IR::Loop *root) { + // We build the instruction graph, via traversing the tree, and then + // top sorting as we recurse out + for (IR::Loop *child : root->subLoops()) 
buildSubGraph(deps, child, 1); + + // The very outer `root` needs to have all instr constituents + // we also need to add the last instruction of each loop as `last` + addBody(deps, root, 0, root->getChild()); + // Add top sort idx + uint32_t idx = 0; // we use ++idx, so only `const` have idx==0 + for (IR::Node *n : root->allNodes()) n->setTopIndex(++idx); +} + +inline auto addAddrToGraph(Arena<> *salloc, Arena<> *lalloc, + lp::ScheduledNode *nodes) -> IR::Loop * { + auto s = salloc->scope(); + // `root` is top level loop + LoopTree *root = LoopTree::root(salloc, lalloc); + for (lp::ScheduledNode *node : nodes->getAllVertices()) + root->addNode(salloc, lalloc, node); + return root->getLoop(); +} +// NOLINTNEXTLINE(misc-no-recursion) +inline auto hasFutureReadsCore(dict::aset &successors, + llvm::Instruction *I) -> bool { + for (auto *U : I->users()) { + auto *UI = llvm::dyn_cast(U); + if (!UI) continue; + if (UI->mayReadFromMemory() && successors.count(UI->getParent())) + return true; + if (llvm::isa(UI) && + hasFutureReadsCore(successors, UI)) + return true; + // TODO: don't just give up if we cast to int? 
+ if (llvm::isa(UI) || llvm::isa(UI)) + return true; + } + return false; +} +inline auto hasFutureReads(Arena<> *alloc, dict::set &LBBs, + llvm::Instruction *I) -> bool { + auto s = alloc->scope(); + dict::aset successors{alloc}; + for (llvm::BasicBlock *S : llvm::successors(I->getParent())) + if (!LBBs.count(S)) successors.insert(S); + return hasFutureReadsCore(successors, I); +} + +struct LoopDepSatisfaction { + IR::Dependencies &deps; + MutPtrVector loopDeps; + + constexpr auto dependencyIDs(IR::Loop *L) { + return utils::VForwardRange{loopDeps.begin(), L->getEdge()}; + } + constexpr auto depencencies(IR::Loop *L) { + return dependencyIDs(L) | deps.getEdgeTransform(); + } +}; +inline Legality::Legality(LoopDepSatisfaction &deps, IR::Loop *L) { + for (int32_t did : deps.dependencyIDs(L)) + if (!update(deps.deps, L, did)) break; +} +inline auto Legality::update(poly::Dependencies &deps, IR::Loop *L, int32_t did) + -> bool { + // note: the dependence hasn't been rotated + Dependence d{deps.get(Dependence::ID{did})}; + IR::Addr *in = d.out, *out = d.in; + utils::Optional peel = deps.determinePeelDepth(L, did); + if (peel) peelFlag |= (1 << (*peel)); + + if (d.revTimeEdge()) { + bool reassociable = in->reassociableReductionPair() != out; + if (reassociable) ++unordered_reduction_count; + if (!reassociable) ++ordered_reduction_count; + return reorderable = reassociable || peel; + } + return reorderable = peel.hasValue(); +}; + +class IROptimizer { + IR::Dependencies &deps; + IR::Cache &instructions; + dict::set &LBBs; + dict::set &eraseCandidates; + IR::Loop *root_; + MutPtrVector loopDeps; + Arena<> *lalloc_; + llvm::TargetLibraryInfo *TLI; + + /// `loopDepSats` places the dependencies at the correct loop level so that + /// we can more easily check all dependencies carried by a particular loop. + /// We use these for checks w/ respect to unrolling and vectorization + /// legality. 
+ /// The returned vector is an integer vector, giving a mapping of loops + /// to depencencies handled at that level. + /// We can use these dependencies for searching reductions for + /// trying to prove legality. + static auto loopDepSats(Arena<> *alloc, IR::Dependencies &deps, + lp::LoopBlock::OptimizationResult res) + -> MutPtrVector { + MutPtrVector loopDeps{math::vector(alloc, deps.size())}; + // place deps at sat level for loops + for (IR::Addr *a : res.addr.getAddr()) { + IR::Loop *L = a->getLoop(); + for (int32_t id : a->inputEdgeIDs(deps)) { + uint8_t lvl = deps.satLevel(IR::Dependence::ID{id}); + L->getLoopAtDepth(lvl)->addEdge(loopDeps, id); + } + } + return loopDeps; + } + [[nodiscard]] constexpr auto getLoopDeps() const -> LoopDepSatisfaction { + return {deps, loopDeps}; + } + // this compares `a` with each of its active outputs. + inline void eliminateAddr(IR::Addr *a) { + for (int32_t id : a->outputEdgeIDs(deps, a->getCurrentDepth())) { + IR::Addr *b = deps.output(Dependence::ID{id}); + // TODO: also check loop extants + if (a->indexMatrix() != b->indexMatrix() || + a->getOffsetOmega() != b->getOffsetOmega()) + return; + if (a->isStore()) { + // On a Write->Write, we remove the first write. + if (b->isStore()) return a->drop(deps); + // Write->Load, we will remove the load if it's in the same block as the + // write, and we can forward the stored value. + if (a->getLoop() != b->getLoop()) return; + instructions.replaceAllUsesWith(b, a->getStoredVal()); + b->drop(deps); + } else if (b->isLoad()) { // Read->Read + // If they're not in the same loop, we need to reload anyway + if (a->getLoop() != b->getLoop()) return; + // If they're in the same loop, we can delete the second read + instructions.replaceAllUsesWith(b, a); + b->drop(deps); + } else return; // Read->Write, can't delete either + } + } + // we eliminate temporaries that meet these conditions: + // 1. 
are only ever stored to (this can be achieved via + // load-elimination/stored-val forwarding in `removeRedundantAddr`) + // 2. are non-escaping, i.e. `llvm::isNonEscapingLocalObject` + // 3. returned by `llvm::isRemovableAlloc` + inline auto eliminateTemporaries(IR::AddrChain addr) -> unsigned { + auto s = lalloc_->scope(); + dict::aset loaded{lalloc_}; + for (IR::Addr *a : addr.getAddr()) + if (a->isLoad()) loaded.insert(a); + unsigned remaining = 0; + for (IR::Addr *a : addr.getAddr()) { + if (a->isDropped()) continue; + ++remaining; + if (loaded.contains(a)) continue; + const llvm::SCEVUnknown *ptr = a->getArrayPointer(); + auto *call = llvm::dyn_cast(ptr->getValue()); + if (!call) continue; + if (!llvm::isNonEscapingLocalObject(call, nullptr)) continue; + if (!llvm::isRemovableAlloc(call, TLI)) continue; + if (hasFutureReads(lalloc_, LBBs, call)) continue; + a->drop(deps); + // we later check if any uses remain other than the associated free + // if not, we can delete them. + // We may want to go ahead and do this here. We don't for now, + // because we have live `llvm::Instruction`s that we haven't removed yet. + // TODO: revisit when handling code generation (and deleting old code) + eraseCandidates.insert(call); + --remaining; + } + return remaining; + } + + // plan: SCC? Iterate over nodes in program order? + // then we can iterate in order. + // What to do about depth? + // We may have + // for (i : I){ + // for (j : J){ + // A[j] = x; // store + // y = A[j]; // load + // } + // } + // In this case, we do have a cycle: + // A[j]^s_i -> A[j]^l_i + // A[j]^l_i -> A[j]^s_{i+1} + // However, this cycle does not prohibit deleting the load, + // replacing it with `y = x`. + // This still holds true if the load were a second store: + // for (i : I){ + // for (j : J){ + // A[j] = x; // store + // A[j] = y; // load + // } + // } + // We could stick with the single `y` store. 
+ // Thus, for eliminating memory operations at a depth of 2, + // we are only concerned with dependencies still valid at a depth of 2. + // for (int i = 0 : i < I; ++i){ + // x[i] /= U[i,i]; + // for (int j = i+1; j < I; ++j){ + // x[j] -= x[i]*U[i,j]; + // } + // } + // Maybe just do the dumb thing? + // Walk the graph for addr costs, and at the same time, + // check the addr for eliminability, checking against what we've stored thus + // far. + // We currently do not store load-load edges, which is why only checking + // edge relationships is not ideal. + // We may store load-load edges in the future, as these could be used as + // part of the cost function of the linear program, i.e. we'd want to + // minimize the distance between loads (but allow reordering them). + // + // I think a reasonable approach is: + // Have a map from array pointer to Addr. Addrs form a chain. + // as we walk the graph, add each newly encountered addr to the front of the + // chain and check if we can eliminate it, or any of its predecessors. + // + // Note (bracketed means we might be able to eliminate): + // Read->[Read] could eliminate read + // Read->Write no change + // Write->[Read] can forward written value + // [Write]->Write can eliminate first write + // Thus, we can fuse this pass with our address cost calculation. + // We check if we can eliminate before calculating the new cost. + // The only case where we may remove an old value, write->write, + // we could just take the old cost and assign it to the new write. + // TODO: if we have only writes to a non-escaping array, we should + // be able to eliminate these writes too, and then also potentially + // remove that array temporary (e.g., if it were malloc'd). + // E.g. check if the array is a `llvm::isNonEscapingLocalObject` and + // allocated by `llvm::isRemovableAlloc`. + void removeRedundantAddr(IR::AddrChain addr) { + // outputEdges are sorted topologically from first to last. 
+ // Example: + // for (int i = 0; i < I; ++i){ + // acc = x[i]; // Statement: 0 + // for (int j = 0; j < i; ++j){ + // acc -= x[j]*U[j,i]; // Statement: 1 + // } + // x[i] = acc; // Statement: 2 + // x[i] = x[i] / U[i,i]; // Statement: 3 + // } + // Here, we have a lot of redundant edges connecting the various `x[i]`s. + // We also have output edges between the `x[i]` and the `x[j]` load in + // statement 1. It is, however, satisfied at `x[i]`'s depth, and ignored. + // So, what would happen here: + // S0R->S2W, no change; break. + // S2W->S3R, replace read with stored value forwarding. + // S2W->S3W, remove S2W as it is shadowed by S3W. + // NOTE: we rely on the `ListRange` iterator supporting safely removing the + // current iter from the list. + for (IR::Addr *a : addr.getAddr()) eliminateAddr(a); + } + /// `sortEdges` sorts each `Addr`'s output edges + /// So that each `Addr`'s output edges are sorted based on the + /// topological ordering of the outputs. + /// The approach to sorting edges is to iterate through nodes backwards + /// whenever we encounter an `Addr`, we push it to the front of each + /// output edge list to which it belongs. + /// We also assigning each `Addr` an order by decrementing an integer each + /// time we encounter one. This is also necessary for Addr elimination, as we + /// want to find the first topologically greater Addr. 
+ // NOLINTNEXTLINE(misc-no-recursion) + auto sortEdges(IR::Loop *R, int32_t pos) -> int32_t { + for (IR::Node *n = R->getLast(); n != R; n = n->getPrev()) { + if (auto *L = llvm::dyn_cast(n)) { + pos = sortEdges(L, pos); + continue; + } + auto *a = llvm::dyn_cast(n); + if (!a) continue; + a->setTopPosition(pos--); + // for each input edge, we push `a` to the front of the output list + for (int32_t id : a->inputEdgeIDs(deps)) { + if (deps.prevOut(Dependence::ID{id}) < 0) continue; + deps.removeOutEdge(id); + IR::Addr *b = deps.input(Dependence::ID{id}); + int32_t oldFirst = b->getEdgeOut(); + deps.prevOut(Dependence::ID{oldFirst}) = id; + deps.prevOut(Dependence::ID{id}) = -1; + deps.nextOut(Dependence::ID{id}) = oldFirst; + b->setEdgeOut(id); + } + } + return pos; + } + void findReductions(IR::AddrChain addr) { + for (IR::Addr *a : addr.getAddr()) a->maybeReassociableReduction(deps); + }; + +public: + IROptimizer(IR::Dependencies &deps, IR::Cache &instr, + dict::set &loopBBs, + dict::set &eraseCandidates_, IR::Loop *root, + Arena<> *lalloc, lp::LoopBlock::OptimizationResult res) + : deps{deps}, instructions{instr}, LBBs{loopBBs}, + eraseCandidates{eraseCandidates_}, root_{root}, lalloc_{lalloc} { + sortEdges(root_, 0); + removeRedundantAddr(res.addr); + unsigned numAddr = eliminateTemporaries(res.addr); + findReductions(res.addr); + loopDeps = loopDepSats(lalloc, deps, res); + /// TODO: legality check + // plan now is to have a `BitArray` big enough to hold `numLoops` entries + // and `numAddr` rows; final axis is contiguous vs non-contiguous + // Additionally, we will have a vector of unroll strategies to consider + // LoopDependencies *ld = LoopDependencies::create(lalloc_, numLoops, + // numAddr); + } +}; + +// +// Considering reordering legality, example +// for (int i = 0: i < I; ++i){ +// for (int j = 0 : j < i; ++j){ +// x[i] -= x[j]*U[j,i]; +// } +// x[i] /= U[i,i]; +// } +// We have an edge from the store `x[i] = x[i] / U[i,i]` to the load of +// `x[j]`, 
when `j = ` the current `i`, on some future iteration. +// We want to unroll; +// for (int i = 0: i < I-3; i += 4){ +// for (int j = 0 : j < i; ++j){ +// x[i] -= x[j]*U[j,i]; +// x[i+1] -= x[j]*U[j,i+1]; +// x[i+2] -= x[j]*U[j,i+2]; +// x[i+3] -= x[j]*U[j,i+3]; +// } +// x[i] /= U[i,i]; // store 0 +// { // perform unrolled j = i iter +// int j = i; // these all depend on store 0 +// x[i+1] -= x[j]*U[j,i+1]; +// x[i+2] -= x[j]*U[j,i+2]; +// x[i+3] -= x[j]*U[j,i+3]; +// } +// // j+1 iteration for i=i iter goes here (but doesn't happen) +// x[i+1] /= U[i+1,i+1]; // store 1 +// { // perform unrolled j = i + 1 iter +// int j = i+1; // these all depend on store 1 +// x[i+2] -= x[j]*U[j,i+2]; +// x[i+3] -= x[j]*U[j,i+3]; +// } +// // j+2 iteration for i=i iter goes here (but doesn't happen) +// // j+2 iteration for i=i+1 iter goes here (but doesn't happen) +// x[i+2] /= U[i+2,i+2]; // store 2 +// { // perform unrolled j = i + 2 iter +// int j = i+2; // this depends on store 2 +// x[i+3] -= x[j]*U[j,i+3]; +// } +// // j+3 iteration for i=i iter goes here (but doesn't happen) +// // j+3 iteration for i=i+1 iter goes here (but doesn't happen) +// // j+3 iteration for i=i+2 iter goes here (but doesn't happen) +// x[i+3] /= U[i+3,i+3]; +// } +// The key to legality here is that we peel off the dependence polyhedra +// from the loop's iteration space. +// We can then perform the dependent iterations in order. +// With masking, the above code can be vectorized in this manner. +// The basic approach is that we have the dependence polyhedra: +// +// 0 <= i_s < I +// 0 <= i_l < I +// 0 <= j_l < i_l +// i_s = j_l // dependence, yields same address in `x` +// +// Note that our schedule sets +// i_s = i_l +// Which gives: +// i_l = i_s = j_l < i_l +// a contradiction, meaning that the dependency is +// conditionally (on our schedule) satisfied. +// Excluding the `i_s = i_l` constraint from the +// polyhedra gives us the region of overlap. 
+// +// When unrolling by `U`, we get using `U=4` as an example: +// i^0_s + 1 = i^1_s +// i^0_s + 2 = i^2_s +// i^0_s + 3 = i^3_s +// 0 <= i^0_s < I +// 0 <= i^1_s < I +// 0 <= i^2_s < I +// 0 <= i^3_s < I +// 0 <= i^0_l < I +// 0 <= i^1_l < I +// 0 <= i^2_l < I +// 0 <= i^3_l < I +// 0 <= j_l < i^0_l +// 0 <= j_l < i^1_l +// 0 <= j_l < i^2_l +// 0 <= j_l < i^3_l +// i^0_s = j_l || i^1_s = j_l || i^2_s = j_l || i^3_s = j_l +// where the final union can be replaced with +// i^0_s = j_l || i^0_s+1 = j_l || i^0_s+2 = j_l || i^0_s+3 = j_l +// i^0_s <= j_1 <= i^0_s+3 +// +// Similarly, we can compress the other inequalities... +// 0 <= i^0_s < I - 3 +// 0 <= i^0_l < I - 3 +// 0 <= j_l < i^0_l +// i^0_s <= j_1 <= i^0_s+3 // dependence region +// +// So, the parallel region is the union +// i^0_s > j_1 || j_1 > i^0_s+3 +// +// In this example, note that the region `j_1 > i^0_s+3` is empty +// so we have one parallel region, and then one serial region. +// +// Lets consider simpler checks. We have +// [ 1 0 ] : x[i] -= +// [ 0 1 ] : x[j] +// [ 1 ] : x[i] /= +// we have a dependency when `i == j`. `i` carries the dependency, but we can +// peel off the independent iters from `j`, and unroll `i` for these. +// +// How to identify: +// [ 1 -1 ] +// vs, if we had two `x[i]` or two `x[j]` +// [ 0, 0 ] +// An idea: look for non-zero so we can peel? +// Or should we look specifically for `x[i] == x[j]` type pattern? +// E.g., if we had +// [ i, j, k, l ] +// [ 2, -1, 2, -1 ] +// we'd need a splitting algorithm. +// E.g., split on the 2nd loop, so we get `j == 2*i + 2*k - l` +// With this, we'd split iterations into groups +// j < 2*i + 2*k - l +// j == 2*i + 2*k - l +// j > 2*i + 2*k - l +// Subsetting the `k` and `l` iteration spaces may be a little annoying, +// so we may initially want to restrict ourselves to peeling the innermost loop. 
+/// +/// Optimize the schedule +inline void optimize(IR::Dependencies deps, IR::Cache &instr, + dict::set &loopBBs, + dict::set &eraseCandidates, + Arena<> *lalloc, lp::LoopBlock::OptimizationResult res) { + // we must build the IR::Loop + // Initially, to help, we use a nested vector, so that we can index into it + // using the fusion omegas. We allocate it with the longer lived `instr` + // alloc, so we can checkpoint it here, and use alloc for other IR nodes. + // The `instr` allocator is more generally the longer lived allocator, + // as it allocates the actual nodes. + + IR::Loop *root = addAddrToGraph(instr.getAllocator(), lalloc, res.nodes); + buildGraph(deps, root); + // `N` is the head of the topologically sorted graph + // We now try to remove redundant memory operations + + IROptimizer(deps, instr, loopBBs, eraseCandidates, root, lalloc, res); +} + +/* +// NOLINTNEXTLINE(misc-no-recursion) +inline auto printSubDotFile(Arena<> *alloc, llvm::raw_ostream &out, + map &names, + llvm::SmallVectorImpl &addrNames, + unsigned addrIndOffset, poly::Loop *lret) +-> poly::Loop * { +poly::Loop *loop{nullptr}; +size_t j = 0; +for (auto *addr : header.getAddr()) loop = addr->getAffLoop(); +for (auto &subTree : subTrees) { + // `names` might realloc, relocating `names[this]` + if (getDepth()) + names[subTree.subTree] = names[this] + "SubLoop#" + std::to_string(j++); + else names[subTree.subTree] = "LoopNest#" + std::to_string(j++); + if (loop == nullptr) + for (auto *addr : subTree.exit.getAddr()) loop = addr->getAffLoop(); + loop = subTree.subTree->printSubDotFile(alloc, out, names, addrNames, + addrIndOffset, loop); +} +const std::string &name = names[this]; +out << "\"" << name + << "\" [shape=plain\nlabel = <\n"; +size_t i = header.printDotNodes(out, 0, addrNames, addrIndOffset, name); +j = 0; +std::string loopEdges; +for (auto &subTree : subTrees) { + std::string label = "f" + std::to_string(++i); + out << " \n"; + loopEdges += "\"" + name + "\":f" + 
std::to_string(i) + " -> \"" + + names[subTree.subTree] + "\":f0 [color=\"#ff0000\"];\n"; + i = subTree.exit.printDotNodes(out, i, addrNames, addrIndOffset, name); +} +out << "
"; +// assert(depth == 0 || (loop != nullptr)); +if (loop && (getDepth() > 0)) { + for (size_t i = loop->getNumLoops(), k = getDepth(); i > k;) + loop = loop->removeLoop(alloc, --i); + loop->pruneBounds(alloc); + loop->printBounds(out); +} else out << "Top Level"; +out << "
SubLoop#" << j++ + << "
>];\n" << loopEdges; +if (lret) return lret; +if ((loop == nullptr) || (getDepth() <= 1)) return nullptr; +return loop->removeLoop(alloc, getDepth() - 1); +} + +inline void printDotFile(Arena<> *alloc, llvm::raw_ostream &out) { +map names; +llvm::SmallVector addrNames(numAddr_); +names[this] = "toplevel"; +out << "digraph LoopNest {\n"; +auto p = alloc.scope(); +printSubDotFile(alloc, out, names, addrNames, subTrees.size(), nullptr); +printDotEdges(out, addrNames); +out << "}\n"; +} +*/ +// class LoopForestSchedule : LoopTreeSchedule { +// [[no_unique_address]] Arena<> *allocator; +// }; +} // namespace CostModeling + +namespace IR { + +inline void Loop::setLegality(CostModeling::LoopDepSatisfaction &deps) { + for (int32_t did : deps.dependencyIDs(this)) + if (!legality.update(deps.deps, this, did)) break; +} + +} // namespace IR +} // namespace poly diff --git a/include/Optimize/Legality.hpp b/include/Optimize/Legality.hpp new file mode 100644 index 000000000..b84b8c0d7 --- /dev/null +++ b/include/Optimize/Legality.hpp @@ -0,0 +1,196 @@ +#pragma once +#ifndef POLY_LEGALITY_HPP_INCLUDED +#define POLY_LEGALITY_HPP_INCLUDED + +#include + +namespace poly { +namespace IR { +class Loop; +class Addr; +}; // namespace IR +namespace poly { +struct Dependence; +class Dependencies; +}; // namespace poly +namespace CostModeling { + +struct LoopDepSatisfaction; + +// If a loop doesn't carry a dependency, it is legal +// If a loop does carry a dependency, we can still consider +// unrolling and vectorization if at least one of: +// - that depenedncy is a reassociable reduction +// - the overlap is for a bounded number of iters, in which case we can peel +// Contains: +// - `getReduction()` enum indicating +// none vs unordered vs ordered +// - `minDistance()`, indicates the minimum distance +// between dependent loop iterations. 
+// for (ptrdiff_t i; i::max()}; + // uint8_t maxdistance{0}; + uint16_t ordered_reduction_count{0}; + uint16_t unordered_reduction_count{0}; + bool reorderable{true}; + // uint8_t illegalFlag{0}; + +public: + // [[nodiscard]] constexpr auto minDistance() const -> uint16_t { + // return mindistance; + // } + // [[nodiscard]] constexpr auto maxDistance() const -> uint16_t { + // return maxdistance; + // } + // [[nodiscard]] constexpr auto noUnroll() const -> bool { + // return illegalFlag & uint8_t(Illegal::Unroll); + // } + // [[nodiscard]] constexpr auto canUnroll() const -> bool { return + // !noUnroll(); } + constexpr auto operator&=(Legality other) -> Legality & { + ordered_reduction_count += other.ordered_reduction_count; + unordered_reduction_count += other.unordered_reduction_count; + // mindistance = std::min(mindistance, other.mindistance); + // maxdistance = std::max(maxdistance, other.maxdistance); + peelFlag |= other.peelFlag; + // illegalFlag |= other.illegalFlag; + return *this; + } + constexpr auto operator=(const Legality &) -> Legality & = default; + [[nodiscard]] constexpr auto operator&(Legality other) const -> Legality { + Legality l{*this}; + return l &= other; + } + constexpr Legality() = default; + constexpr Legality(const Legality &) = default; + Legality(LoopDepSatisfaction &deps, IR::Loop *L); + // deeperAccess(const poly::Dependencies &deps, IR::Loop *L, IR::Addr *in) + // are any of the outputs of `in` in a subloop of `L` + // static auto deeperAccess(const poly::Dependencies &deps, IR::Loop *L, + // IR::Addr *in) -> bool { + // return std::ranges::any_of(in->outputEdgeIDs(deps), + // [&](int32_t id) -> bool { + // IR::Addr *a = + // deps.output(Dependence::ID{id}); return + // (a->getLoop() != L) && L->contains(a); + // }); + // } + // inline auto anyInteriorDependents(IR::Loop *L, IR::Addr *out) -> bool { + // return std::ranges::any_of(out->outputEdgeIDs(*this), + // [&](int32_t i) -> bool { + // IR::Addr *a = 
output(Dependence::ID{i}); + // return (a->getLoop() != L) && + // L->contains(a); + // }); + // } + + // inline auto anyInteriorDependencies(IR::Loop *L, IR::Addr *in) -> bool { + + // return std::ranges::any_of(in->inputEdgeIDs(*this), [&](int32_t i) -> + // bool { + // IR::Addr *a = input(Dependence::ID{i}); + // return (a->getLoop() != L) && L->contains(a); + // }); + // } + auto update(poly::Dependencies &deps, IR::Loop *L, int32_t did) -> bool; + constexpr auto numReductions() const -> uint16_t { + uint16_t numReduct; + if (__builtin_add_overflow(ordered_reduction_count, + unordered_reduction_count, &numReduct)) + return std::numeric_limits::max(); + return numReduct; + } +}; +static_assert(sizeof(Legality) == 8); +} // namespace CostModeling +} // namespace poly +#endif // POLY_LEGALITY_HPP_INCLUDED diff --git a/include/Optimize/RegisterFile.hpp b/include/Optimize/RegisterFile.hpp new file mode 100644 index 000000000..3ede8b7d6 --- /dev/null +++ b/include/Optimize/RegisterFile.hpp @@ -0,0 +1,81 @@ +#pragma once +#ifndef RegisterFile_hpp_INCLUDED +#define RegisterFile_hpp_INCLUDED + +#include +#include + +namespace poly::RegisterFile { +// returns vector width in bytes, ignoring mprefer-vector-width +inline auto estimateMaximumVectorWidth(llvm::LLVMContext &C, + const llvm::TargetTransformInfo &TTI) + -> uint8_t { + uint8_t twiceMaxVectorWidth = 2; + auto *f32 = llvm::Type::getFloatTy(C); + llvm::InstructionCost prevCost = TTI.getArithmeticInstrCost( + llvm::Instruction::FAdd, + llvm::FixedVectorType::get(f32, twiceMaxVectorWidth)); + while (true) { + llvm::InstructionCost nextCost = TTI.getArithmeticInstrCost( + llvm::Instruction::FAdd, + llvm::FixedVectorType::get(f32, twiceMaxVectorWidth *= 2)); + if (nextCost > prevCost) break; + prevCost = nextCost; + } + return 2 * twiceMaxVectorWidth; +} + +class CPURegisterFile { + uint8_t maximumVectorWidth; + uint8_t numVectorRegisters; + uint8_t numGeneralPurposeRegisters; + uint8_t numPredicateRegisters; + +#if 
defined(__x86_64__) + // hacky check for has AVX512 + static inline auto hasAVX512(llvm::LLVMContext &C, + const llvm::TargetTransformInfo &TTI) -> bool { + return TTI.isLegalMaskedExpandLoad( + llvm::FixedVectorType::get(llvm::Type::getDoubleTy(C), 8)); + } +#else + // assume we're not cross-compiling to x64 from some other arch to reduce the + // risk of false positives + static constexpr hasAVX512(llvm::LLVMContext &, + const llvm::TargetTransformInfo &) + ->bool { + return false; + } +#endif + + static auto estimateNumPredicateRegisters( + llvm::LLVMContext &C, const llvm::TargetTransformInfo &TTI) -> uint8_t { + if (TTI.supportsScalableVectors()) return 8; + // hacky check for AVX512 + if (hasAVX512(C, TTI)) return 7; // 7, because k0 is reserved for unmasked + return 0; + } + +public: + CPURegisterFile(llvm::LLVMContext &C, const llvm::TargetTransformInfo &TTI) { + maximumVectorWidth = estimateMaximumVectorWidth(C, TTI); + numVectorRegisters = TTI.getNumberOfRegisters(true); + numGeneralPurposeRegisters = TTI.getNumberOfRegisters(false); + numPredicateRegisters = estimateNumPredicateRegisters(C, TTI); + } + [[nodiscard]] constexpr auto getNumVectorBits() const -> uint8_t { + return maximumVectorWidth; + } + [[nodiscard]] constexpr auto getNumVector() const -> uint8_t { + return numVectorRegisters; + } + [[nodiscard]] constexpr auto getNumScalar() const -> uint8_t { + return numGeneralPurposeRegisters; + } + [[nodiscard]] constexpr auto getNumPredicate() const -> uint8_t { + return numPredicateRegisters; + } +}; + +} // namespace poly::RegisterFile +#endif // RegisterFile_hpp_INCLUDED diff --git a/include/Polyhedra/Comparators.hpp b/include/Polyhedra/Comparators.hpp index 829de427e..306d64033 100644 --- a/include/Polyhedra/Comparators.hpp +++ b/include/Polyhedra/Comparators.hpp @@ -1,6 +1,8 @@ #pragma once #include "Utilities/Optional.hpp" +#include +#include #include #include #include @@ -9,13 +11,10 @@ #include #include #include -#include #include #include 
-#include #include #include -#include namespace poly::comparator { using math::PtrVector, math::MutPtrVector, math::Vector, math::_, math::Row, @@ -24,7 +23,7 @@ using math::PtrVector, math::MutPtrVector, math::Vector, math::_, math::Row, math::NormalForm::simplifySystemsImpl, math::NormalForm::solveSystem, math::StridedVector, math::vector, math::matrix, math::identity, math::Simplex, math::DenseDims, math::DenseMatrix; -using utils::invariant, utils::Arena, utils::Optional; +using utils::invariant, alloc::Arena, utils::Optional; // For `== 0` constraints struct EmptyComparator { static constexpr auto getNumConstTerms() -> ptrdiff_t { return 0; } @@ -126,9 +125,9 @@ template struct BaseComparator { PtrVector y) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(delta.size() >= N); - assert(x.size() >= N); - assert(y.size() >= N); + invariant(delta.size() >= N); + invariant(x.size() >= N); + invariant(y.size() >= N); for (ptrdiff_t n = 0; n < N; ++n) delta[n] = x[n] - y[n]; return static_cast(this)->greaterEqual(delta); } @@ -145,8 +144,8 @@ template struct BaseComparator { [[nodiscard]] constexpr auto greater(PtrVector x, PtrVector y) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(N <= x.size()); - assert(N <= y.size()); + invariant(N <= x.size()); + invariant(N <= y.size()); Vector delta(N); for (ptrdiff_t n = 0; n < N; ++n) delta[n] = x[n] - y[n]; --delta[0]; @@ -170,7 +169,7 @@ template struct BaseComparator { [[nodiscard]] constexpr auto lessEqual(MutPtrVector x) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(N <= x.size()); + invariant(N <= x.size()); for (ptrdiff_t n = 0; n < N; ++n) x[n] *= -1; bool ret = static_cast(this)->greaterEqual(x); for (ptrdiff_t n = 0; n < N; ++n) x[n] *= -1; @@ -178,7 +177,7 @@ template struct BaseComparator { } [[nodiscard]] constexpr auto lessEqual(PtrVector x) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(N <= x.size()); + invariant(N <= x.size()); Vector 
y{x[_(0, N)]}; return lessEqual(y); } @@ -193,13 +192,13 @@ template struct BaseComparator { [[nodiscard]] constexpr auto lessEqual(PtrVector x, int64_t y) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(N <= x.size()); + invariant(N <= x.size()); Vector z{x[_(0, N)]}; return lessEqual(z, y); } [[nodiscard]] constexpr auto less(MutPtrVector x) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(N <= x.size()); + invariant(N <= x.size()); int64_t x0 = x[0]; x[0] = -x0 - 1; for (ptrdiff_t i = 1; i < N; ++i) x[i] *= -1; @@ -210,7 +209,7 @@ template struct BaseComparator { } [[nodiscard]] constexpr auto less(PtrVector x) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(N <= x.size()); + invariant(N <= x.size()); Vector y{x[_(0, N)]}; return less(y); } @@ -223,7 +222,7 @@ template struct BaseComparator { [[nodiscard]] constexpr auto greater(PtrVector x) const -> bool { // TODO: avoid this needless memcopy and (possible) allocation? const ptrdiff_t N = getNumConstTerms(); - assert(N <= x.size()); + invariant(N <= x.size()); Vector xm{x[_(0, N)]}; return greater(math::view(xm)); } @@ -237,8 +236,8 @@ template struct BaseComparator { PtrVector y) const -> bool { const ptrdiff_t N = getNumConstTerms(); - assert(x.size() >= N); - assert(y.size() >= N); + invariant(x.size() >= N); + invariant(y.size() >= N); if (x[_(0, N)] == y[_(0, N)]) return true; Vector delta{x[_(0, N)] - y[_(0, N)]}; return equal(delta); @@ -264,8 +263,8 @@ concept Comparator = requires(T t, PtrVector x, int64_t y) { template struct BaseSymbolicComparator : BaseComparator> { - [[no_unique_address]] unsigned int numVar{0}; - [[no_unique_address]] unsigned int numEquations{0}; + [[no_unique_address]] ptrdiff_t numVar{0}; + [[no_unique_address]] ptrdiff_t numEquations{0}; using ThisT = BaseSymbolicComparator; using BaseT = BaseComparator; using BaseT::greaterEqual; @@ -291,16 +290,16 @@ struct BaseSymbolicComparator : BaseComparator> { [[nodiscard]] constexpr 
auto getD() const -> PtrVector { return static_cast(this)->getDImpl(); } - constexpr auto getV(Row r, Col c) -> MutDensePtrMatrix { + constexpr auto getV(Row<> r, Col<> c) -> MutDensePtrMatrix { return static_cast(this)->getVImpl(r, c); } - constexpr auto getU(Row r, Col c) -> MutDensePtrMatrix { + constexpr auto getU(Row<> r, Col<> c) -> MutDensePtrMatrix { return static_cast(this)->getUImpl(r, c); } - constexpr auto getD(Row n) -> MutPtrVector { + constexpr auto getD(Row<> n) -> MutPtrVector { return static_cast(this)->getDImpl(n); } - constexpr void setURank(Row r) { static_cast(this)->setURankImpl(r); } + constexpr void setURank(Row<> r) { static_cast(this)->setURankImpl(r); } [[nodiscard]] constexpr auto getURank() const -> ptrdiff_t { return static_cast(this)->getURankImpl(); } @@ -320,14 +319,14 @@ struct BaseSymbolicComparator : BaseComparator> { const ptrdiff_t numConExplicit = ptrdiff_t(A.numRow()) + 1; const ptrdiff_t numConTotal = numConExplicit + numNonNegative; numVar = ptrdiff_t(A.numCol()); - Row rowV = Row{numVar + numConTotal}; - Col colV = Col{2 * numConTotal}; + Row rowV = Row<>{numVar + numConTotal}; + Col colV = Col<>{2 * numConTotal}; /// B.size() == (A.numCol() + A.numRow() + 1 + numNonNegative) x /// (2 * (A.numRow() + 1 + numNonNegative)) /// auto B = getV(rowV, colV); - std::fill_n(B.begin(), B.numRow() * B.numCol(), 0); - B(0, 0) = 1; + std::fill_n(B.begin(), ptrdiff_t(B.numRow()) * ptrdiff_t(B.numCol()), 0); + B[0, 0] = 1; // B = [ A_0 A_1 // 0 I ] // V = [B' 0 @@ -335,12 +334,12 @@ struct BaseSymbolicComparator : BaseComparator> { // V = [A_0' 0 0 // A_1' I 0 // S_0 S_1 I] - B(_(begin, numVar), _(1, numConExplicit)) << A.transpose(); + B[_(begin, numVar), _(1, numConExplicit)] << A.t(); for (ptrdiff_t j = 0; j < numNonNegative; ++j) - B(j + numVar - numNonNegative, numConExplicit + j) = 1; + B[j + numVar - numNonNegative, numConExplicit + j] = 1; for (ptrdiff_t j = 0; j < numConTotal; ++j) { - B(j + numVar, j) = -1; - B(j + numVar, j + 
numConTotal) = 1; + B[j + numVar, j] = -1; + B[j + numVar, j + numConTotal] = 1; } numEquations = numConTotal; initCore(alloc); @@ -354,11 +353,11 @@ struct BaseSymbolicComparator : BaseComparator> { const ptrdiff_t numInEqConTotal = numInEqConExplicit + numNonNegative; const ptrdiff_t numEqCon = ptrdiff_t(E.numRow()); numVar = ptrdiff_t(A.numCol()); - Row rowV = Row{numVar + numInEqConTotal}; - Col colV = Col{2 * numInEqConTotal + numEqCon}; + Row rowV = Row<>{numVar + numInEqConTotal}; + Col colV = Col<>{2 * numInEqConTotal + numEqCon}; auto B = getV(rowV, colV); - std::fill_n(B.begin(), B.numRow() * B.numCol(), 0); - B(0, 0) = 1; + std::fill_n(B.begin(), ptrdiff_t(B.numRow()) * ptrdiff_t(B.numCol()), 0); + B[0, 0] = 1; // B is `A` augmented with the implicit non-negative constraints // B = [ A_0 A_1 // 0 I ] @@ -368,17 +367,17 @@ struct BaseSymbolicComparator : BaseComparator> { // A_1' I E_1' 0 // S_0 S_1 0 I] numEquations = numInEqConTotal + numEqCon; - B(_(begin, numVar), _(1, numInEqConExplicit)) << A.transpose(); - B(_(begin, numVar), _(numInEqConTotal, numInEqConTotal + numEqCon)) - << E.transpose(); + B[_(begin, numVar), _(1, numInEqConExplicit)] << A.t(); + B[_(begin, numVar), _(numInEqConTotal, numInEqConTotal + numEqCon)] + << E.t(); if (numNonNegative) - B(_(numVar - numNonNegative, numVar), - _(numInEqConExplicit, numInEqConExplicit + numNonNegative)) + B[_(numVar - numNonNegative, numVar), + _(numInEqConExplicit, numInEqConExplicit + numNonNegative)] .diag() << 1; for (ptrdiff_t j = 0; j < numInEqConTotal; ++j) { - B(j + numVar, j) = -1; - B(j + numVar, j + numEquations) = 1; + B[j + numVar, j] = -1; + B[j + numVar, j + numEquations] = 1; } initCore(alloc); } @@ -386,10 +385,11 @@ struct BaseSymbolicComparator : BaseComparator> { [[nodiscard]] static constexpr auto memoryNeededNonNegative(PtrMatrix A, EmptyMatrix, ptrdiff_t numNonNegative) -> ptrdiff_t { - return memoryNeededImpl(A.numRow(), A.numCol(), Row{0}, ++numNonNegative); + return 
memoryNeededImpl(A.numRow(), A.numCol(), Row<>{0}, ++numNonNegative); } [[nodiscard]] inline static constexpr auto - memoryNeededImpl(Row Ar, Col Ac, Row Er, ptrdiff_t numPos) -> ptrdiff_t { + memoryNeededImpl(Row<> Ar, Col<> Ac, Row<> Er, ptrdiff_t numPos) + -> ptrdiff_t { // alternative: ptrdiff_t numInEqConTotal = ptrdiff_t(Ar) + numPos; ptrdiff_t colV = (numInEqConTotal << 1) + ptrdiff_t(Er); @@ -399,7 +399,7 @@ struct BaseSymbolicComparator : BaseComparator> { [[nodiscard]] static constexpr auto memoryNeededNonNegative(PtrMatrix A, ptrdiff_t numNonNegative) -> ptrdiff_t { - return memoryNeededImpl(A.numRow(), A.numCol(), Row{0}, ++numNonNegative); + return memoryNeededImpl(A.numRow(), A.numCol(), Row<>{0}, ++numNonNegative); } [[nodiscard]] static constexpr auto memoryNeededNonNegative(PtrMatrix A, PtrMatrix E, @@ -410,11 +410,11 @@ struct BaseSymbolicComparator : BaseComparator> { [[nodiscard]] static constexpr auto memoryNeeded(PtrMatrix A, EmptyMatrix, bool pos0) -> ptrdiff_t { - return memoryNeededImpl(A.numRow(), A.numCol(), Row{0}, pos0); + return memoryNeededImpl(A.numRow(), A.numCol(), Row<>{0}, pos0); } [[nodiscard]] static constexpr auto memoryNeeded(PtrMatrix A, bool pos0) -> ptrdiff_t { - return memoryNeededImpl(A.numRow(), A.numCol(), Row{0}, pos0); + return memoryNeededImpl(A.numRow(), A.numCol(), Row<>{0}, pos0); } [[nodiscard]] static constexpr auto memoryNeeded(PtrMatrix A, PtrMatrix E, @@ -425,17 +425,17 @@ struct BaseSymbolicComparator : BaseComparator> { bool pos0) { const ptrdiff_t numCon = ptrdiff_t(A.numRow()) + pos0; numVar = ptrdiff_t(A.numCol()); - Row rowV = numVar + numCon; - Col colV = 2 * numCon; + Row<> rowV = {numVar + numCon}; + Col<> colV = {2 * numCon}; auto B = getV(rowV, colV); - std::fill_n(B.begin(), B.numRow() * B.numCol(), 0); - B(0, 0) = pos0; + std::fill_n(B.begin(), ptrdiff_t(B.numRow()) * ptrdiff_t(B.numCol()), 0); + B[0, 0] = pos0; // V = [A' 0 // S I] - B(_(begin, numVar), _(pos0, numCon)) << A.transpose(); + 
B[_(begin, numVar), _(pos0, numCon)] << A.t(); for (ptrdiff_t j = 0; j < numCon; ++j) { - B(j + numVar, j) = -1; - B(j + numVar, j + numCon) = 1; + B[j + numVar, j] = -1; + B[j + numVar, j + numCon] = 1; } numEquations = numCon; initCore(alloc); @@ -449,21 +449,21 @@ struct BaseSymbolicComparator : BaseComparator> { const ptrdiff_t numInEqCon = ptrdiff_t(A.numRow()) + pos0; numVar = ptrdiff_t(A.numCol()); const ptrdiff_t numEqCon = ptrdiff_t(E.numRow()); - Row rowV = Row{numVar + numInEqCon}; - Col colV = Col{2 * numInEqCon + numEqCon}; + Row rowV = Row<>{numVar + numInEqCon}; + Col colV = Col<>{2 * numInEqCon + numEqCon}; auto B = getV(rowV, colV); B << 0; // V = [A' E' 0 // S 0 I] - B(0, 0) = pos0; - B(_(begin, numVar), _(pos0, numInEqCon)) << A.transpose(); - // A(_, _(pos0, end)).transpose(); - B(_(begin, numVar), _(numInEqCon, numInEqCon + numEqCon)) << E.transpose(); + B[0, 0] = pos0; + B[_(begin, numVar), _(pos0, numInEqCon)] << A.t(); + // A(_, _(pos0, end)).t(); + B[_(begin, numVar), _(numInEqCon, numInEqCon + numEqCon)] << E.t(); numEquations = numInEqCon + numEqCon; for (ptrdiff_t j = 0; j < numInEqCon; ++j) { - B(j + numVar, j) = -1; - B(j + numVar, j + numEquations) = 1; + B[j + numVar, j] = -1; + B[j + numVar, j + numEquations] = 1; } initCore(alloc); } @@ -477,7 +477,7 @@ struct BaseSymbolicComparator : BaseComparator> { U.diag() << 1; // We will have query of the form Ax = q; simplifySystemsImpl({B, U}); - while ((R) && allZero(B(R - 1, _))) --R; + while ((R) && allZero(B[ptrdiff_t(R) - 1, _])) --R; setURank(R); ptrdiff_t numColB = ptrdiff_t(B.numCol()); // upper bounded by numVar + numInEq x numVar + numInEq @@ -490,15 +490,15 @@ struct BaseSymbolicComparator : BaseComparator> { auto Vt{identity(alloc, numColB)}; // Ht.numRow() > Ht.numCol() = R // (2*numInEq + numEq) x R - auto Ht = matrix(alloc, Row{numColB}, Col{ptrdiff_t(R)}); - Ht << B(_(0, R), _).transpose(); + auto Ht = matrix(alloc, Row<>{numColB}, Col<>{ptrdiff_t(R)}); + Ht << B[_(0, R), 
_].t(); solveSystem(Ht, Vt); // upper bounded by numVar + numInEq // rows/cols, but of rank R // smaller based on rank getD(R) << Ht.diag(); // d.size() == R // upper bounded by 2*numInEq + numEq x 2*numInEq + numEq - getV() << Vt.transpose(); + getV() << Vt.t(); } // Note that this is only valid when the comparator was constructed @@ -507,26 +507,21 @@ struct BaseSymbolicComparator : BaseComparator> { auto V = getV(); auto U = getU(); auto d = getD(); - StridedVector b{U(_, 0)}; + StridedVector b{U[_, 0]}; if (d.empty()) { if (!allZero(b[_(V.numRow(), end)])) return false; Col oldn = V.numCol(); - auto H{matrix(&alloc, V.numRow(), oldn + 1)}; + auto H{matrix(&alloc, V.numRow(), ++auto{oldn})}; // IntMatrix H{V.numRow(), oldn + 1}; - H(_, _(0, oldn)) << V; - H(_, oldn) << -b; + H[_, _(0, oldn)] << V; + H[_, oldn] << -b; solveSystem(H); - bool ret = true; for (ptrdiff_t i = numEquations; i < H.numRow(); ++i) - if (auto rhs = H(i, oldn)) - if ((rhs > 0) != (H(i, i) > 0)) { - ret = false; - break; - } - return ret; + if ((H[i, oldn] > 0) != (H[i, i] > 0)) return false; + return true; } // Column rank deficient case - Row numSlack = V.numRow() - numEquations; + Row numSlack = Row<>{ptrdiff_t(V.numRow()) - numEquations}; // Vector dinv = d; // copy // We represent D martix as a vector, and multiply the lcm to the // linear equation to avoid store D^(-1) as rational type @@ -535,33 +530,33 @@ struct BaseSymbolicComparator : BaseComparator> { b2 << -b * lcmD / d; // Vector b2 = -b * Dlcm / d; ptrdiff_t numRowTrunc = ptrdiff_t(U.numRow()); - auto c{vector(&alloc, ptrdiff_t(V.numRow() - numEquations))}; - c << V(_(numEquations, end), _(begin, numRowTrunc)) * b2; + auto c{vector(&alloc, ptrdiff_t(V.numRow()) - numEquations)}; + c << b2 * V[_(numEquations, end), _(begin, numRowTrunc)].t(); // Vector c = V(_(numEquations, end), _(begin, numRowTrunc)) * // b2; - auto dimNS = V.numCol() - numRowTrunc; + ptrdiff_t dimNS = ptrdiff_t(V.numCol()) - numRowTrunc; // expand W stores [c 
-JV2 JV2] // we use simplex to solve [-JV2 JV2][y2+ y2-]' <= JV1D^(-1)Uq // where y2 = y2+ - y2- - auto expandW{matrix(&alloc, Row{numSlack}, Col{dimNS * 2 + 1})}; + auto expandW{matrix(&alloc, numSlack, Col<>{dimNS * 2 + 1})}; for (ptrdiff_t i = 0; i < numSlack; ++i) { - expandW(i, 0) = c[i]; + expandW[i, 0] = c[i]; // expandW(i, 0) *= Dlcm; for (ptrdiff_t j = 0; j < dimNS; ++j) { - auto val = V(i + numEquations, numRowTrunc + j) * lcmD; - expandW(i, j + 1) = -val; - expandW(i, dimNS + 1 + j) = val; + auto val = V[i + numEquations, numRowTrunc + j] * lcmD; + expandW[i, j + 1] = -val; + expandW[i, dimNS + 1 + j] = val; } } return Simplex::positiveVariables(&alloc, expandW).hasValue(); } [[nodiscard]] constexpr auto isEmpty() const -> bool { - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; return isEmpty(alloc); } [[nodiscard]] constexpr auto greaterEqual(PtrVector query) const -> bool { - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; return greaterEqual(alloc, query); } [[nodiscard]] constexpr auto greaterEqualFullRank(Arena<> *alloc, @@ -569,16 +564,15 @@ struct BaseSymbolicComparator : BaseComparator> { -> bool { auto V = getV(); if (!allZero(b[_(V.numRow(), end)])) return false; - auto H = matrix(alloc, V.numRow(), V.numCol() + 1); + auto H = matrix(alloc, V.numRow(), ++auto{V.numCol()}); Col oldn = V.numCol(); - H(_, _(0, oldn)) << V; + H[_, _(0, oldn)] << V; // H.numRow() == b.size(), because we're only here if dimD == 0, // in which case V.numRow() == U.numRow() == b.size() - H(_, oldn) << b; + H[_, oldn] << b; solveSystem(H); for (ptrdiff_t i = numEquations; i < H.numRow(); ++i) - if (auto rhs = H(i, oldn)) - if ((rhs > 0) != (H(i, i) > 0)) return false; + if ((H[i, oldn] > 0) != (H[i, i] > 0)) return false; return true; } [[nodiscard]] constexpr auto @@ -586,7 +580,7 @@ struct BaseSymbolicComparator : BaseComparator> { -> bool { auto V = getV(); auto d = getD(); - Row numSlack = V.numRow() - numEquations; + Row numSlack = 
Row<>{ptrdiff_t(V.numRow()) - numEquations}; auto dinv = vector(alloc, d.size()); dinv << d; // copy // We represent D martix as a vector, and multiply the lcm to the @@ -598,20 +592,20 @@ struct BaseSymbolicComparator : BaseComparator> { b[i] *= x; } ptrdiff_t numRowTrunc = getURank(); - auto c = vector(alloc, unsigned(V.numRow() - numEquations)); - c << V(_(numEquations, end), _(begin, numRowTrunc)) * b; - auto dimNS = V.numCol() - numRowTrunc; + auto c = vector(alloc, ptrdiff_t(V.numRow()) - numEquations); + c << b * V[_(numEquations, end), _(begin, numRowTrunc)].t(); + auto dimNS = ptrdiff_t(V.numCol()) - numRowTrunc; // expand W stores [c -JV2 JV2] // we use simplex to solve [-JV2 JV2][y2+ y2-]' <= JV1D^(-1)Uq // where y2 = y2+ - y2- - auto expandW = matrix(alloc, numSlack, dimNS * 2 + 1); + auto expandW = matrix(alloc, numSlack, Col<>{dimNS * 2 + 1}); for (ptrdiff_t i = 0; i < numSlack; ++i) { - expandW(i, 0) = c[i]; + expandW[i, 0] = c[i]; // expandW(i, 0) *= Dlcm; for (ptrdiff_t j = 0; j < dimNS;) { - auto val = V(i + numEquations, numRowTrunc + j++) * lcmD; - expandW(i, j) = -val; - expandW(i, dimNS + j) = val; + auto val = V[i + numEquations, numRowTrunc + j++] * lcmD; + expandW[i, j] = -val; + expandW[i, dimNS + j] = val; } } Optional optS{Simplex::positiveVariables(alloc, expandW)}; @@ -621,8 +615,8 @@ struct BaseSymbolicComparator : BaseComparator> { PtrVector query) const -> bool { auto U = getU(); - auto b = vector(&alloc, unsigned(U.numRow())); - b << U(_, _(begin, query.size())) * query; + auto b = vector(&alloc, ptrdiff_t(U.numRow())); + b << query * U[_, _(begin, query.size())].t(); return getD().size() ? 
greaterEqualRankDeficient(&alloc, b) : greaterEqualFullRank(&alloc, b); } @@ -631,7 +625,7 @@ struct LinearSymbolicComparator : public BaseSymbolicComparator { using Base = BaseSymbolicComparator; using Base::init; - using Matrix = math::ManagedArray; + using Matrix = math::ManagedArray>; [[no_unique_address]] Matrix U; [[no_unique_address]] Matrix V; [[no_unique_address]] Vector d; @@ -648,7 +642,7 @@ struct LinearSymbolicComparator return d; } - constexpr void setURankImpl(Row r) { + constexpr void setURankImpl(Row<> r) { V.truncate(r); U.truncate(r); } @@ -661,18 +655,18 @@ struct LinearSymbolicComparator [[nodiscard]] constexpr auto getURankImpl() const -> ptrdiff_t { return ptrdiff_t(U.numRow()); } - constexpr auto getUImpl(Row r, Col c) -> MutDensePtrMatrix { + constexpr auto getUImpl(Row<> r, Col<> c) -> MutDensePtrMatrix { U.resizeForOverwrite(r, c); return U; } - constexpr auto getVImpl(Row r, Col c) -> MutDensePtrMatrix { + constexpr auto getVImpl(Row<> r, Col<> c) -> MutDensePtrMatrix { V.setSize(r, c); - U.setSize(r, Col{ptrdiff_t(r)}); + U.setSize(r, Col<>{ptrdiff_t(r)}); return V; } - constexpr auto getDImpl(Row N) -> MutPtrVector { + constexpr auto getDImpl(Row<> N) -> MutPtrVector { d.resizeForOverwrite(ptrdiff_t(N)); - V.resizeForOverwrite(Row{ptrdiff_t{V.numCol()}}); + V.resizeForOverwrite(Row<>{ptrdiff_t{V.numCol()}}); return d; } static constexpr auto construct(PtrMatrix Ap, EmptyMatrix, @@ -682,14 +676,14 @@ struct LinearSymbolicComparator static constexpr auto construct(PtrMatrix Ap, bool pos0) -> LinearSymbolicComparator { LinearSymbolicComparator cmp; - std::allocator alloc{}; + alloc::Mallocator alloc{}; cmp.init(alloc, Ap, pos0); return cmp; }; static constexpr auto construct(PtrMatrix Ap, PtrMatrix Ep, bool pos0) -> LinearSymbolicComparator { LinearSymbolicComparator cmp; - std::allocator alloc{}; + alloc::Mallocator alloc{}; cmp.init(alloc, Ap, Ep, pos0); return cmp; }; @@ -703,7 +697,7 @@ struct LinearSymbolicComparator ptrdiff_t 
numNonNeg) -> LinearSymbolicComparator { LinearSymbolicComparator cmp; - std::allocator alloc{}; + alloc::Mallocator alloc{}; cmp.initNonNegative(alloc, Ap, numNonNeg); return cmp; }; @@ -712,7 +706,7 @@ struct LinearSymbolicComparator ptrdiff_t numNonNeg) -> LinearSymbolicComparator { LinearSymbolicComparator cmp; - std::allocator alloc{}; + alloc::Mallocator alloc{}; cmp.initNonNegative(alloc, Ap, Ep, numNonNeg); return cmp; }; @@ -722,15 +716,12 @@ struct PtrSymbolicComparator using Base = BaseSymbolicComparator; using Base::init; int64_t *mem; - // unsigned int numVar; - // unsigned int numInEq; - // unsigned int numEq; - unsigned int rankU{0}; - unsigned int colU{0}; - unsigned int dimV{0}; - unsigned int dimD{0}; + ptrdiff_t rankU{0}; + ptrdiff_t colU{0}; + ptrdiff_t dimV{0}; + ptrdiff_t dimD{0}; - constexpr void setURankImpl(Row r) { rankU = unsigned(r); } + constexpr void setURankImpl(Row<> r) { rankU = ptrdiff_t(r); } [[nodiscard]] constexpr auto getURankImpl() const -> ptrdiff_t { return rankU; } @@ -747,18 +738,15 @@ struct PtrSymbolicComparator // } // NOLINTNEXTLINE(readability-make-member-function-const) constexpr auto getUImpl() -> MutDensePtrMatrix { - return {mem, DenseDims{rankU, colU}}; + return {mem, DenseDims<>{{rankU}, {colU}}}; } // A = V // H = A // H.truncate(Row()); // size is H.numCol() * H.numCol() - [[nodiscard]] constexpr auto numVRows() const -> unsigned { - return dimD ? 
dimV : rankU; - } // offset by (numVar + numInEq)*(numVar + numInEq) constexpr auto getVImpl() -> MutDensePtrMatrix { - return {getUImpl().end(), DenseDims{numVRows(), dimV}}; + return {getUImpl().end(), DenseDims<>{numVRows(), Col<>{dimV}}}; } // size D constexpr auto getDImpl() -> MutPtrVector { @@ -766,24 +754,25 @@ struct PtrSymbolicComparator return {getVImpl().end(), dimD}; } [[nodiscard]] constexpr auto getUImpl() const -> DensePtrMatrix { - return {mem, DenseDims{rankU, colU}}; + return {mem, DenseDims<>{Row<>{rankU}, Col<>{colU}}}; } [[nodiscard]] constexpr auto getVImpl() const -> DensePtrMatrix { - return {mem + ptrdiff_t(rankU) * colU, DenseDims{numVRows(), dimV}}; + return {mem + ptrdiff_t(rankU) * colU, + DenseDims<>{numVRows(), Col<>{dimV}}}; } [[nodiscard]] constexpr auto getDImpl() const -> PtrVector { return {mem + ptrdiff_t(rankU) * colU + ptrdiff_t(numVRows()) * dimV, dimD}; } // constexpr auto getUImpl(Row r, Col c) -> MutPtrMatrix {} - constexpr auto getVImpl(Row r, Col c) -> MutDensePtrMatrix { - colU = rankU = unsigned(r); - dimV = unsigned(c); + constexpr auto getVImpl(Row<> r, Col<> c) -> MutDensePtrMatrix { + colU = rankU = ptrdiff_t(r); + dimV = ptrdiff_t(c); getUImpl() << 0; dimD = 0; return getVImpl(); } - constexpr auto getDImpl(Row r) -> MutPtrVector { - dimD = unsigned(r); + constexpr auto getDImpl(Row<> r) -> MutPtrVector { + dimD = ptrdiff_t(r); invariant(dimD > 0); return getDImpl(); } @@ -831,6 +820,10 @@ struct PtrSymbolicComparator }; private: + [[nodiscard]] constexpr auto numVRows() const -> Row<> { + return {ptrdiff_t(dimD ? 
dimV : rankU)}; + } + constexpr PtrSymbolicComparator(int64_t *p) : mem(p) {} }; @@ -839,24 +832,24 @@ static_assert(Comparator); constexpr void moveEqualities(DenseMatrix &, EmptyMatrix, const Comparator auto &) {} -constexpr void moveEqualities(DenseMatrix &A, math::IntMatrix &E, +constexpr void moveEqualities(DenseMatrix &A, math::IntMatrix<> &E, const Comparator auto &C) { const ptrdiff_t numVar = ptrdiff_t(E.numCol()); - assert(A.numCol() == numVar); + invariant(A.numCol() == numVar); if (A.numRow() <= 1) return; for (ptrdiff_t o = ptrdiff_t(A.numRow()) - 1; o > 0;) { for (ptrdiff_t i = o--; i < A.numRow(); ++i) { bool isNeg = true; for (ptrdiff_t v = 0; v < numVar; ++v) { - if (A(i, v) != -A(o, v)) { + if (A[i, v] != -A[o, v]) { isNeg = false; break; } } - if (isNeg && C.equalNegative(A(i, _), A(o, _))) { + if (isNeg && C.equalNegative(A[i, _], A[o, _])) { ptrdiff_t e = ptrdiff_t(E.numRow()); - E.resize(e + 1, numVar); - for (ptrdiff_t v = 0; v < numVar; ++v) E(e, v) = A(i, v); + E.resize(Row<>{e + 1}, Col<>{numVar}); + for (ptrdiff_t v = 0; v < numVar; ++v) E[e, v] = A[i, v]; eraseConstraint(A, i, o); break; } @@ -865,7 +858,7 @@ constexpr void moveEqualities(DenseMatrix &A, math::IntMatrix &E, } // NOLINTNEXTLINE(performance-unnecessary-value-param) -constexpr auto linear(std::allocator, PtrMatrix A, +constexpr auto linear(alloc::Mallocator, PtrMatrix A, EmptyMatrix, bool pos0) { return LinearSymbolicComparator::construct(A, pos0); } @@ -874,7 +867,7 @@ constexpr auto linear(Arena<> *alloc, PtrMatrix A, return PtrSymbolicComparator::construct(alloc, A, pos0); } // NOLINTNEXTLINE(performance-unnecessary-value-param) -constexpr auto linear(std::allocator, PtrMatrix A, +constexpr auto linear(alloc::Mallocator, PtrMatrix A, PtrMatrix E, bool pos0) { return LinearSymbolicComparator::construct(A, E, pos0); } @@ -884,8 +877,9 @@ constexpr auto linear(Arena<> *alloc, PtrMatrix A, } // NOLINTNEXTLINE(performance-unnecessary-value-param) -constexpr auto 
linearNonNegative(std::allocator, PtrMatrix A, - EmptyMatrix, ptrdiff_t numNonNeg) { +constexpr auto linearNonNegative(alloc::Mallocator, + PtrMatrix A, EmptyMatrix, + ptrdiff_t numNonNeg) { return LinearSymbolicComparator::constructNonNeg(A, numNonNeg); } constexpr auto linearNonNegative(Arena<> *alloc, PtrMatrix A, @@ -893,8 +887,9 @@ constexpr auto linearNonNegative(Arena<> *alloc, PtrMatrix A, return PtrSymbolicComparator::constructNonNeg(alloc, A, numNonNeg); } // NOLINTNEXTLINE(performance-unnecessary-value-param) -constexpr auto linearNonNegative(std::allocator, PtrMatrix A, - PtrMatrix E, ptrdiff_t numNonNeg) { +constexpr auto linearNonNegative(alloc::Mallocator, + PtrMatrix A, PtrMatrix E, + ptrdiff_t numNonNeg) { return LinearSymbolicComparator::constructNonNeg(A, E, numNonNeg); } constexpr auto linearNonNegative(Arena<> *alloc, PtrMatrix A, diff --git a/include/Polyhedra/Dependence.hpp b/include/Polyhedra/Dependence.hpp index 626b2aeb7..059747c48 100644 --- a/include/Polyhedra/Dependence.hpp +++ b/include/Polyhedra/Dependence.hpp @@ -1,11 +1,16 @@ #pragma once #include "IR/Address.hpp" +#include "IR/Node.hpp" +#include "Math/Array.hpp" +#include "Math/Simplex.hpp" #include "Polyhedra/DependencyPolyhedra.hpp" #include "Polyhedra/Loops.hpp" #include "Polyhedra/Schedule.hpp" #include "Support/Iterators.hpp" +#include +#include #include -#include +#include #include #include #include @@ -17,20 +22,29 @@ namespace poly { /// Represents a dependence relationship between two memory accesses. /// It contains simplices representing constraints that affine schedules /// are allowed to take. -class Dependence { -public: +struct Dependence { + + // public: struct ID { int32_t id; + [[nodiscard]] constexpr explicit operator bool() const { return id >= 0; } + }; + // TODO: revert to `bool` flag for `Forward`? 
+ enum MetaFlags : uint8_t { + Forward = 1, + FreeOfDeeperDeps = 2, + Reassociable = 4, + NotReassociable = 8 }; -private: + // private: // // - NotNull depPoly; - NotNull dependenceSatisfaction; - NotNull dependenceBounding; - NotNull in; - NotNull out; + Valid depPoly; + Valid dependenceSatisfaction; + Valid dependenceBounding; + Valid in; + Valid out; // Dependence *nextInput{nullptr}; // all share same `in` // Dependence *nextOutput{nullptr}; // // all share same `out` @@ -39,14 +53,24 @@ class Dependence { // // was because of offsets when solving the linear program (value = // // 1). // std::array satLvl{255, 255, 255, 255, 255, 255, 255}; + ID revTimeEdge_{-1}; std::array satLvl; - bool forward; + uint8_t meta{0}; + uint8_t peel{255}; // sentinal value for cannot peel + + // template [[nodiscard]] auto get() const -> const auto & { + // if constexpr (I == 0) return depPoly; + // else if constexpr (I==1) return dependenceSatisfaction; + // else if constexpr (I==1) return dependenceBounding; + // } - constexpr auto getSimplexPair() -> std::array, 2> { + constexpr auto getSimplexPair() -> std::array, 2> { return {dependenceSatisfaction, dependenceBounding}; } + [[nodiscard]] constexpr auto getMeta() const -> uint8_t { return meta; } + [[nodiscard]] constexpr auto getPeel() const -> uint8_t { return peel; } -public: + // public: friend class Dependencies; // constexpr auto getNextInput() -> Dependence * { return nextInput; } // [[nodiscard]] constexpr auto getNextInput() const -> const Dependence * { @@ -58,32 +82,30 @@ class Dependence { // [[nodiscard]] constexpr auto getNextOutput() const -> const Dependence * { // return nextOutput; // } - [[nodiscard]] constexpr auto input() -> NotNull { return in; } - [[nodiscard]] constexpr auto output() -> NotNull { return out; } - [[nodiscard]] constexpr auto input() const -> NotNull { - return in; - } - [[nodiscard]] constexpr auto output() const -> NotNull { - return out; + [[nodiscard]] constexpr auto input() const 
-> Valid { return in; } + [[nodiscard]] constexpr auto output() const -> Valid { return out; } + [[nodiscard]] constexpr auto revTimeEdge() const -> ID { + return revTimeEdge_; } + [[nodiscard]] constexpr auto peelable() const -> bool { return peel != 255; } // constexpr auto setNextInput(Dependence *n) -> Dependence * { // return nextInput = n; // } // constexpr auto setNextOutput(Dependence *n) -> Dependence * { // return nextOutput = n; // } - constexpr Dependence(NotNull poly, - std::array, 2> depSatBound, - NotNull i, NotNull o, bool fwd) - : depPoly(poly), dependenceSatisfaction(depSatBound[0]), - dependenceBounding(depSatBound[1]), in(i), out(o), forward(fwd) {} - constexpr Dependence(NotNull poly, - std::array, 2> depSatBound, - NotNull i, NotNull o, - std::array sL, bool fwd) - : depPoly(poly), dependenceSatisfaction(depSatBound[0]), - dependenceBounding(depSatBound[1]), in(i), out(o), satLvl(sL), - forward(fwd) {} + // constexpr Dependence(Valid poly, + // std::array, 2> depSatBound, + // Valid i, Valid o, bool fwd) + // : depPoly(poly), dependenceSatisfaction(depSatBound[0]), + // dependenceBounding(depSatBound[1]), in(i), out(o), forward(fwd) {} + // constexpr Dependence(Valid poly, + // std::array, 2> depSatBound, + // Valid i, Valid o, + // std::array sL, bool fwd) + // : depPoly(poly), dependenceSatisfaction(depSatBound[0]), + // dependenceBounding(depSatBound[1]), in(i), out(o), satLvl(sL), + // forward(fwd) {} /// stashSatLevel() -> Dependence & /// This is used to track sat levels in the LP recursion. @@ -113,14 +135,19 @@ class Dependence { static constexpr auto satLevelMask(uint8_t slvl) -> uint8_t { return slvl & uint8_t(127); // NOTE: deduces to `int` } + // note that sat levels start at `0`, `0` meaning the outer most loop + // satisfies it. Thus, `satLevel() == 0` means the `depth == 1` loop satisfied + // it. 
[[nodiscard]] constexpr auto satLevel() const -> uint8_t { return satLevelMask(satLvl[0]); } - [[nodiscard]] constexpr auto isSat(unsigned depth) const -> bool { + /// `isSat` returns `true` on the level that satisfies it + [[nodiscard]] constexpr auto isSat(int depth) const -> bool { invariant(depth <= 127); return satLevel() <= depth; } - [[nodiscard]] constexpr auto isActive(unsigned depth) const -> bool { + /// `isActive` returns `false` on the level that satisfies it + [[nodiscard]] constexpr auto isActive(int depth) const -> bool { invariant(depth <= 127); return satLevel() > depth; } @@ -147,7 +174,9 @@ class Dependence { return in->getArrayPointer(); } /// indicates whether forward is non-empty - [[nodiscard]] constexpr auto isForward() const -> bool { return forward; } + /// Direction in simplex [x,y]: Forward ? x -> y : y -> x + /// i.e., is the simplex `[in, out]` (forward) or `[out, in]` (!forward) + [[nodiscard]] constexpr auto isForward() const -> bool { return meta & 1; } [[nodiscard]] constexpr auto nodeIn() const -> const lp::ScheduledNode * { return in->getNode(); } @@ -166,9 +195,9 @@ class Dependence { return in->indexMatrix(); } // satisfies dep if it is empty when conditioning on inPhi and outPhi - void checkEmptySat(Arena<> *alloc, NotNull inLoop, + void checkEmptySat(Arena<> *alloc, Valid inLoop, const int64_t *inOff, DensePtrMatrix inPhi, - NotNull outLoop, const int64_t *outOff, + Valid outLoop, const int64_t *outOff, DensePtrMatrix outPhi) { if (!isForward()) { std::swap(inLoop, outLoop); @@ -178,7 +207,7 @@ class Dependence { invariant(inPhi.numRow(), outPhi.numRow()); if (depPoly->checkSat(*alloc, inLoop, inOff, inPhi, outLoop, outOff, outPhi)) - satLvl[0] = uint8_t(inPhi.numRow() - 1); + satLvl[0] = uint8_t(ptrdiff_t(inPhi.numRow()) - 1); } constexpr void copySimplices(Arena<> *alloc) { dependenceSatisfaction = dependenceSatisfaction->copy(alloc); @@ -211,7 +240,8 @@ class Dependence { return out->getNaturalDepth(); } [[nodiscard]] 
constexpr auto isInactive(size_t depth) const -> bool { - return (depth >= std::min(out->getCurrentDepth(), in->getCurrentDepth())); + return (depth >= + size_t(std::min(out->getCurrentDepth(), in->getCurrentDepth()))); } [[nodiscard]] constexpr auto getNumLambda() const -> unsigned { return depPoly->getNumLambda() << 1; @@ -241,14 +271,14 @@ class Dependence { // 2 == 1 for const offset + 1 for w assert(2 + depPoly->getNumLambda() + getNumPhiCoefficients() + getNumOmegaCoefficients() == - size_t(dependenceSatisfaction->getConstraints().numCol())); - } - [[nodiscard]] constexpr auto getDepPoly() -> NotNull { - return depPoly; + ptrdiff_t(dependenceSatisfaction->getConstraints().numCol())); } - [[nodiscard]] constexpr auto getDepPoly() const -> NotNull { + [[nodiscard]] constexpr auto getDepPoly() const -> Valid { return depPoly; } + // [[nodiscard]] constexpr auto getDepPoly() const -> Valid { + // return depPoly; + // } [[nodiscard]] constexpr auto getNumConstraints() const -> unsigned { return dependenceBounding->getNumCons() + dependenceSatisfaction->getNumCons(); @@ -266,52 +296,52 @@ class Dependence { return dependenceBounding->getConstraints(); } [[nodiscard]] auto getSatLambda() const -> PtrMatrix { - return getSatConstraints()(_, _(1, 1 + depPoly->getNumLambda())); + return getSatConstraints()[_, _(1, 1 + depPoly->getNumLambda())]; } [[nodiscard]] auto getBndLambda() const -> PtrMatrix { - return getBndConstraints()(_, _(1, 1 + depPoly->getNumLambda())); + return getBndConstraints()[_, _(1, 1 + depPoly->getNumLambda())]; } [[nodiscard]] auto getSatPhiCoefs() const -> PtrMatrix { auto l = 3 + depPoly->getNumLambda(); - return getSatConstraints()(_, _(l, l + getNumPhiCoefficients())); + return getSatConstraints()[_, _(l, l + getNumPhiCoefficients())]; } [[nodiscard]] auto getSatPhi0Coefs() const -> PtrMatrix { auto l = 3 + depPoly->getNumLambda(); - return getSatConstraints()(_, _(l, l + depPoly->getDim0())); + return getSatConstraints()[_, _(l, l + 
depPoly->getDim0())]; } [[nodiscard]] auto getSatPhi1Coefs() const -> PtrMatrix { auto l = 3 + depPoly->getNumLambda() + depPoly->getDim0(); - return getSatConstraints()(_, _(l, l + depPoly->getDim1())); + return getSatConstraints()[_, _(l, l + depPoly->getDim1())]; } [[nodiscard]] auto getBndPhiCoefs() const -> PtrMatrix { auto l = 3 + depPoly->getNumLambda(); - return getBndConstraints()(_, _(l, l + getNumPhiCoefficients())); + return getBndConstraints()[_, _(l, l + getNumPhiCoefficients())]; } [[nodiscard]] auto getBndPhi0Coefs() const -> PtrMatrix { auto l = 3 + depPoly->getNumLambda(); - return getBndConstraints()(_, _(l, l + depPoly->getDim0())); + return getBndConstraints()[_, _(l, l + depPoly->getDim0())]; } [[nodiscard]] auto getBndPhi1Coefs() const -> PtrMatrix { auto l = 3 + depPoly->getNumLambda() + depPoly->getDim0(); - return getBndConstraints()(_, _(l, l + depPoly->getDim1())); + return getBndConstraints()[_, _(l, l + depPoly->getDim1())]; } [[nodiscard]] auto getSatOmegaCoefs() const -> PtrMatrix { auto l = 1 + depPoly->getNumLambda(); - return getSatConstraints()(_, _(l, l + getNumOmegaCoefficients())); + return getSatConstraints()[_, _(l, l + getNumOmegaCoefficients())]; } [[nodiscard]] auto getBndOmegaCoefs() const -> PtrMatrix { auto l = 1 + depPoly->getNumLambda(); - return getBndConstraints()(_, _(l, l + getNumOmegaCoefficients())); + return getBndConstraints()[_, _(l, l + getNumOmegaCoefficients())]; } [[nodiscard]] auto getSatW() const -> math::StridedVector { - return getSatConstraints()(_, 1 + depPoly->getNumLambda() + + return getSatConstraints()[_, 1 + depPoly->getNumLambda() + getNumPhiCoefficients() + - getNumOmegaCoefficients()); + getNumOmegaCoefficients()]; } [[nodiscard]] auto getBndCoefs() const -> PtrMatrix { size_t lb = 1 + depPoly->getNumLambda() + getNumPhiCoefficients() + getNumOmegaCoefficients(); - return getBndConstraints()(_, _(lb, end)); + return getBndConstraints()[_, _(lb, end)]; } [[nodiscard]] auto satPhiCoefs() 
const -> std::array, 2> { PtrMatrix phiCoefsIn = getSatPhi1Coefs(), @@ -326,14 +356,14 @@ class Dependence { return {phiCoefsIn, phiCoefsOut}; } [[nodiscard]] auto isSatisfied(Arena<> alloc, - NotNull schIn, - NotNull schOut) const + Valid schIn, + Valid schOut) const -> bool { - unsigned numLoopsIn = in->getCurrentDepth(), - numLoopsOut = out->getCurrentDepth(), - numLoopsCommon = std::min(numLoopsIn, numLoopsOut), - numLoopsTotal = numLoopsIn + numLoopsOut, - numVar = numLoopsIn + numLoopsOut + 2; + ptrdiff_t numLoopsIn = in->getCurrentDepth(), + numLoopsOut = out->getCurrentDepth(), + numLoopsCommon = std::min(numLoopsIn, numLoopsOut), + numLoopsTotal = numLoopsIn + numLoopsOut, + numVar = numLoopsIn + numLoopsOut + 2; invariant(dependenceSatisfaction->getNumVars(), numVar); auto schv = vector(&alloc, numVar, int64_t(0)); const SquarePtrMatrix inPhi = schIn->getPhi(); @@ -362,8 +392,8 @@ class Dependence { // forward means offset is 2nd - 1st schv[0] = outOffOmega[i]; schv[1] = inOffOmega[i]; - schv[_(2, 2 + numLoopsIn)] << inPhi(last - i, _); - schv[_(2 + numLoopsIn, 2 + numLoopsTotal)] << outPhi(last - i, _); + schv[_(2, 2 + numLoopsIn)] << inPhi[last - i, _]; + schv[_(2 + numLoopsIn, 2 + numLoopsTotal)] << outPhi[last - i, _]; // dependenceSatisfaction is phi_t - phi_s >= 0 // dependenceBounding is w + u'N - (phi_t - phi_s) >= 0 // we implicitly 0-out `w` and `u` here, @@ -379,16 +409,16 @@ class Dependence { [[nodiscard]] auto isSatisfied(Arena<> alloc, PtrVector inFusOmega, PtrVector outFusOmega) const -> bool { - unsigned numLoopsIn = in->getCurrentDepth(), - numLoopsOut = out->getCurrentDepth(), - numLoopsCommon = std::min(numLoopsIn, numLoopsOut), - numVar = numLoopsIn + numLoopsOut + 2; + ptrdiff_t numLoopsIn = in->getCurrentDepth(), + numLoopsOut = out->getCurrentDepth(), + numLoopsCommon = std::min(numLoopsIn, numLoopsOut), + numVar = numLoopsIn + numLoopsOut + 2; invariant(dependenceSatisfaction->getNumVars(), numVar); auto schv = vector(&alloc, 
numVar, int64_t(0)); // Vector schv(dependenceSatisfaction->getNumVars(),int64_t(0)); const unsigned numLambda = getNumLambda(); // when i == numLoopsCommon, we've passed the last loop - for (size_t i = 0; i <= numLoopsCommon; ++i) { + for (ptrdiff_t i = 0; i <= numLoopsCommon; ++i) { if (int64_t o2idiff = outFusOmega[i] - inFusOmega[i]) return (o2idiff > 0); // we should not be able to reach `numLoopsCommon` @@ -420,10 +450,9 @@ class Dependence { } return true; } - [[nodiscard]] auto isSatisfied(Arena<> alloc, - NotNull sx, - NotNull sy, - size_t d) const -> bool { + [[nodiscard]] auto isSatisfied(Arena<> alloc, Valid sx, + Valid sy, size_t d) const + -> bool { unsigned numLambda = depPoly->getNumLambda(), nLoopX = depPoly->getDim0(), nLoopY = depPoly->getDim1(), numLoopsTotal = nLoopX + nLoopY; MutPtrVector sch{math::vector(&alloc, numLoopsTotal + 2)}; @@ -435,9 +464,9 @@ class Dependence { return dependenceSatisfaction->satisfiable(alloc, sch, numLambda); } [[nodiscard]] auto isSatisfied(Arena<> alloc, size_t d) const -> bool { - unsigned numLambda = depPoly->getNumLambda(), - numLoopsX = depPoly->getDim0(), - numLoopsTotal = numLoopsX + depPoly->getDim1(); + ptrdiff_t numLambda = depPoly->getNumLambda(), + numLoopsX = depPoly->getDim0(), + numLoopsTotal = numLoopsX + depPoly->getDim1(); MutPtrVector sch{math::vector(&alloc, numLoopsTotal + 2)}; sch << 0; invariant(sch.size(), numLoopsTotal + 2); @@ -446,18 +475,6 @@ class Dependence { return dependenceSatisfaction->satisfiable(alloc, sch, numLambda); } - struct Active { - unsigned depth; - constexpr Active(const Active &) noexcept = default; - constexpr Active(Active &&) noexcept = default; - constexpr Active() noexcept = default; - constexpr auto operator=(const Active &) noexcept -> Active & = default; - constexpr Active(unsigned depth) : depth(depth) {} - constexpr auto operator()(const Dependence *d) const -> bool { - return d->isActive(depth); - } - }; - friend inline auto operator<<(llvm::raw_ostream 
&os, const Dependence &d) -> llvm::raw_ostream & { os << "Dependence Poly "; @@ -548,72 +565,134 @@ static_assert(sizeof(Dependence) <= 64); // i_0 = i_1 // j_0 = j_1 - k_1 class Dependencies { - char *data{nullptr}; - int32_t numData{0}; - // int32_t tombstone{-1}; + using Tuple = + containers::Tuple, 2>, DepPoly *, int32_t, + int32_t, int32_t, int32_t, int32_t, + std::array, uint8_t, uint8_t>; + + math::ManagedSOA datadeps; + + static constexpr size_t OutI = 0; + static constexpr size_t InI = 1; + static constexpr size_t SimplexPairI = 2; + static constexpr size_t DepPolyI = 3; + static constexpr size_t NextEdgeOutI = 4; + static constexpr size_t PrevEdgeOutI = 5; + static constexpr size_t NextEdgeInI = 6; + static constexpr size_t PrevEdgeInI = 7; + static constexpr size_t RevTimeEdgeI = 8; + static constexpr size_t SatLevelI = 9; + static constexpr size_t GetMetaI = 10; + static constexpr size_t GetPeelI = 11; public: using ID = Dependence::ID; - constexpr Dependencies() noexcept = default; - constexpr Dependencies(Arena<> *alloc) - : data(alloc->allocate(memNeeded(64))) {} - constexpr Dependencies(const Dependencies &) noexcept = default; // or delete? - constexpr Dependencies(Dependencies &&) noexcept = default; // or delete? 
- constexpr auto operator=(Dependencies &&other) noexcept - -> Dependencies & = default; - constexpr auto operator=(const Dependencies &other) noexcept - -> Dependencies & = default; + Dependencies(ptrdiff_t len) : datadeps(len) {} + Dependencies(const Dependencies &) noexcept = delete; + constexpr Dependencies(Dependencies &&) noexcept = default; + constexpr auto operator=(Dependencies &&other) noexcept -> Dependencies & { + datadeps = std::move(other.datadeps); + return *this; + }; - [[nodiscard]] constexpr auto size() const noexcept -> int32_t { - return numData; + [[nodiscard]] constexpr auto size() const noexcept -> ptrdiff_t { + return datadeps.size(); } private: - void addEdge(Arena<> *alloc, Dependence d) { - int32_t id = size(); - push_pack(alloc, d); - d.input()->setEdgeOut(id); - d.output()->setEdgeIn(id); - } - static constexpr auto memNeeded(size_t N) -> size_t { - constexpr size_t memPer = sizeof(int32_t) * 2 + sizeof(DepPoly *) + - sizeof(math::Simplex *) * 2 + sizeof(bool) + - sizeof(uint8_t); - return N * memPer; + constexpr auto tup(Dependence d, int32_t i) -> Tuple { + IR::Addr *out = d.output(), *in = d.input(); + if (out->getEdgeOut() >= 0) prevOut(ID{out->getEdgeOut()}) = i; + if (in->getEdgeIn() >= 0) prevIn(ID{in->getEdgeIn()}) = i; + in->setEdgeOut(i); + out->setEdgeIn(i); + return Tuple{out, + in, + d.getSimplexPair(), + d.getDepPoly(), + out->getEdgeOut(), + -1, + in->getEdgeIn(), + -1, + d.revTimeEdge().id, + d.satLvl, + d.getMeta(), + d.getPeel()}; } - void timelessCheck(Arena<> *alloc, NotNull dxy, NotNull x, - NotNull y, - std::array, 2> pair, bool isFwd) { - const size_t numLambda = dxy->getNumLambda(); - invariant(dxy->getTimeDim(), unsigned(0)); + /// set(ID i, Dependence d) + /// stores `d` at index `i` + /// Dependence `d` is pushed to the fronts of the edgeOut and edgeIn chains. 
+ constexpr void set(int32_t i, Dependence d) { datadeps[i] = tup(d, i); } + constexpr void set(ID i, Dependence d) { set(i.id, d); } + auto addEdge(Dependence d) -> ID { + int32_t id{int32_t(datadeps.size())}; + invariant(id >= 0); + datadeps.push_back(tup(d, id)); + return {int32_t(id)}; + } + + void addOrdered(Valid dxy, Valid x, + Valid y, std::array, 2> pair, + bool isFwd) { + ptrdiff_t numLambda = dxy->getNumLambda(); if (!isFwd) { std::swap(pair[0], pair[1]); std::swap(x, y); } pair[0]->truncateVars(1 + numLambda + dxy->getNumScheduleCoef()); - addEdge(alloc, Dependence{dxy, pair, x, y, isFwd}); - } - void timelessCheck(Arena<> *alloc, NotNull dxy, NotNull x, - NotNull y, - std::array, 2> pair) { - return timelessCheck(alloc, dxy, x, y, pair, - checkDirection(*alloc, pair, x, y, dxy->getNumLambda(), - dxy->getNumVar() + 1)); + addEdge(Dependence{.depPoly = dxy, + .dependenceSatisfaction = pair[0], + .dependenceBounding = pair[1], + .in = x, + .out = y, + .meta = isFwd}); + } + void timelessCheck(Arena<> *alloc, Valid dxy, Valid x, + Valid y, + std::array, 2> pair) { + invariant(dxy->getTimeDim(), unsigned(0)); + return addOrdered(dxy, x, y, pair, + checkDirection(*alloc, pair, x, y, dxy->getNumLambda(), + Col<>{dxy->getNumVar() + 1})); } // emplaces dependencies with repeat accesses to the same memory across // time - void timeCheck(Arena<> *alloc, NotNull dxy, NotNull x, - NotNull y, - std::array, 2> pair) { - bool isFwd = checkDirection(*alloc, pair, x, y, dxy->getNumLambda(), - dxy->getA().numCol() - dxy->getTimeDim()); + void timeCheck(Arena<> *alloc, Valid dxy, Valid x, + Valid y, std::array, 2> pair) { + bool isFwd = checkDirection( + *alloc, pair, x, y, dxy->getNumLambda(), + Col<>{ptrdiff_t(dxy->getA().numCol()) - dxy->getTimeDim()}); timeCheck(alloc, dxy, x, y, pair, isFwd); } - void timeCheck(Arena<> *alloc, NotNull dxy, NotNull x, - NotNull y, - std::array, 2> pair, bool isFwd) { + static void timeStep(Valid dxy, MutPtrMatrix fE, + MutPtrMatrix 
sE, + ptrdiff_t numInequalityConstraintsOld, + ptrdiff_t numEqualityConstraintsOld, ptrdiff_t ineqEnd, + ptrdiff_t posEqEnd, ptrdiff_t v, ptrdiff_t step) { + for (ptrdiff_t c = 0; c < numInequalityConstraintsOld; ++c) { + int64_t Acv = dxy->getA(Row<>{c}, Col<>{v}); + if (!Acv) continue; + Acv *= step; + fE[0, c + 1] -= Acv; // *1 + sE[0, c + 1] -= Acv; // *1 + } + for (ptrdiff_t c = 0; c < numEqualityConstraintsOld; ++c) { + // each of these actually represents 2 inds + int64_t Ecv = dxy->getE(Row<>{c}, Col<>{v}); + if (!Ecv) continue; + Ecv *= step; + fE[0, c + ineqEnd] -= Ecv; + fE[0, c + posEqEnd] += Ecv; + sE[0, c + ineqEnd] -= Ecv; + sE[0, c + posEqEnd] += Ecv; + } + } + void timeCheck(Arena<> *alloc, Valid dxy, Valid x, + Valid y, std::array, 2> pair, + bool isFwd) { const unsigned numInequalityConstraintsOld = dxy->getNumInequalityConstraints(), numEqualityConstraintsOld = dxy->getNumEqualityConstraints(), @@ -623,9 +702,9 @@ class Dependencies { numScheduleCoefs = dxy->getNumScheduleCoef(); invariant(numLambda, dxy->getNumLambda()); // copy backup - std::array, 2> farkasBackups{pair[0]->copy(alloc), - pair[1]->copy(alloc)}; - NotNull in = x, out = y; + std::array, 2> farkasBackups{pair[0]->copy(alloc), + pair[1]->copy(alloc)}; + Valid in = x, out = y; if (isFwd) { std::swap(farkasBackups[0], farkasBackups[1]); } else { @@ -633,10 +712,15 @@ class Dependencies { std::swap(pair[0], pair[1]); } pair[0]->truncateVars(1 + numLambda + numScheduleCoefs); - auto dep0 = Dependence{dxy->copy(alloc), pair, in, out, isFwd}; - invariant(out->getCurrentDepth() + in->getCurrentDepth(), - dep0.getNumPhiCoefficients()); - addEdge(alloc, dep0); + Dependence dep0{.depPoly = dxy->copy(alloc), + .dependenceSatisfaction = pair[0], + .dependenceBounding = pair[1], + .in = in, + .out = out, + .meta = isFwd}; + invariant(ptrdiff_t(out->getCurrentDepth()) + in->getCurrentDepth(), + ptrdiff_t(dep0.getNumPhiCoefficients())); + ID d0ID{addEdge(dep0)}, prevID = d0ID; // pair is invalid 
const ptrdiff_t timeDim = dxy->getTimeDim(), numVar = 1 + dxy->getNumVar() - timeDim; @@ -646,90 +730,63 @@ class Dependencies { // dep0.depPoly->truncateVars(numVar); // dep0.depPoly->setTimeDim(0); - invariant(out->getCurrentDepth() + in->getCurrentDepth(), - dep0.getNumPhiCoefficients()); + invariant(ptrdiff_t(out->getCurrentDepth()) + in->getCurrentDepth(), + ptrdiff_t(dep0.getNumPhiCoefficients())); // now we need to check the time direction for all times - // anything approaching 16 time dimensions would be absolutely - // insane - math::Vector timeDirection(timeDim); - ptrdiff_t t = 0; - auto fE{farkasBackups[0]->getConstraints()(_, _(1, end))}; - auto sE{farkasBackups[1]->getConstraints()(_, _(1, end))}; - do { + // anything approaching 16 time dimensions would be insane + for (ptrdiff_t t = 0;;) { // set `t`th timeDim to +1/-1 // basically, what we do here is set it to `step` and pretend it was // a constant. so a value of c = a'x + t*step -> c - t*step = a'x so // we update the constant `c` via `c -= t*step`. // we have the problem that. 
int64_t step = dxy->getNullStep(t); - ptrdiff_t v = numVar + t, i = 0; - while (true) { - for (ptrdiff_t c = 0; c < numInequalityConstraintsOld; ++c) { - int64_t Acv = dxy->getA(c, v); - if (!Acv) continue; - Acv *= step; - fE(0, c + 1) -= Acv; // *1 - sE(0, c + 1) -= Acv; // *1 - } - for (ptrdiff_t c = 0; c < numEqualityConstraintsOld; ++c) { - // each of these actually represents 2 inds - int64_t Ecv = dxy->getE(c, v); - if (!Ecv) continue; - Ecv *= step; - fE(0, c + ineqEnd) -= Ecv; - fE(0, c + posEqEnd) += Ecv; - sE(0, c + ineqEnd) -= Ecv; - sE(0, c + posEqEnd) += Ecv; - } - if (i++ != 0) break; // break after undoing - timeDirection[t] = - checkDirection(*alloc, farkasBackups, *out, *in, numLambda, - dxy->getA().numCol() - dxy->getTimeDim()); - step *= -1; // flip to undo, then break - } - } while (++t < timeDim); - t = 0; - do { - // checkDirection(farkasBackups, x, y, numLambda) == false - // correct time direction would make it return true - // thus sign = timeDirection[t] ? 1 : -1 - int64_t step = (2 * timeDirection[t] - 1) * dxy->getNullStep(t); ptrdiff_t v = numVar + t; - for (ptrdiff_t c = 0; c < numInequalityConstraintsOld; ++c) { - int64_t Acv = dxy->getA(c, v); - if (!Acv) continue; - Acv *= step; - dxy->getA(c, 0) -= Acv; - fE(0, c + 1) -= Acv; // *1 - sE(0, c + 1) -= Acv; // *-1 - } - for (ptrdiff_t c = 0; c < numEqualityConstraintsOld; ++c) { - // each of these actually represents 2 inds - int64_t Ecv = dxy->getE(c, v); - if (!Ecv) continue; - Ecv *= step; - dxy->getE(c, 0) -= Ecv; - fE(0, c + ineqEnd) -= Ecv; - fE(0, c + posEqEnd) += Ecv; - sE(0, c + ineqEnd) -= Ecv; - sE(0, c + posEqEnd) += Ecv; + bool repeat = (++t < timeDim); + std::array, 2> fp{farkasBackups}; + if (repeat) { + fp[0] = fp[0]->copy(alloc); + fp[1] = fp[1]->copy(alloc); } - } while (++t < timeDim); - // dxy->truncateVars(numVar); - // dxy->setTimeDim(0); - farkasBackups[0]->truncateVars(1 + numLambda + numScheduleCoefs); - auto dep1 = Dependence{dxy, farkasBackups, out, in, 
!isFwd}; - invariant(out->getCurrentDepth() + in->getCurrentDepth(), - dep1.getNumPhiCoefficients()); - addEdge(alloc, dep1); + // set (or unset) for this timedim + auto fE{fp[0]->getConstraints()[_, _(1, end)]}; + auto sE{fp[1]->getConstraints()[_, _(1, end)]}; + timeStep(dxy, fE, sE, numInequalityConstraintsOld, + numEqualityConstraintsOld, ineqEnd, posEqEnd, v, step); + // checkDirection should be `true`, so if `false` we flip the sign + // this is because `isFwd = checkDirection` of the original + // `if (isFwd)`, we swapped farkasBackups args, making the result + // `false`; for our timeDim to capture the opposite movement + // through time, we thus need to flip it back to `true`. + // `if (!isFwd)`, i.e. the `else` branch above, we don't flip the + // args, so it'd still return `false` and a flip would still mean `true`. + if (!checkDirection( + *alloc, fp, *out, *in, numLambda, + Col<>{ptrdiff_t(dxy->getA().numCol()) - dxy->getTimeDim()})) + timeStep(dxy, fE, sE, numInequalityConstraintsOld, + numEqualityConstraintsOld, ineqEnd, posEqEnd, v, -2 * step); + + fp[0]->truncateVars(1 + numLambda + numScheduleCoefs); + Dependence dep1{.depPoly = dxy, + .dependenceSatisfaction = farkasBackups[0], + .dependenceBounding = farkasBackups[1], + .in = out, + .out = in, + .revTimeEdge_ = prevID, + .meta = !isFwd}; + invariant(ptrdiff_t(out->getCurrentDepth()) + in->getCurrentDepth(), + ptrdiff_t(dep1.getNumPhiCoefficients())); + prevID = addEdge(dep1); + if (!repeat) break; + } + revTimeEdge(d0ID) = prevID.id; } static auto checkDirection(Arena<> alloc, - const std::array, 2> &p, - NotNull x, - NotNull y, - NotNull xSchedule, - NotNull ySchedule, - unsigned numLambda, Col nonTimeDim) -> bool { + const std::array, 2> &p, + Valid x, Valid y, + Valid xSchedule, + Valid ySchedule, + ptrdiff_t numLambda, Col<> nonTimeDim) -> bool { const auto &[fxy, fyx] = p; unsigned numLoopsX = x->getCurrentDepth(), numLoopsY = y->getCurrentDepth(), numLoopsTotal = numLoopsX + numLoopsY; @@ 
-760,16 +817,16 @@ class Dependencies { assert(i != numLoopsCommon); sch[0] = xOffOmega[i]; sch[1] = yOffOmega[i]; - sch[_(2, 2 + numLoopsX)] << xPhi(last - i, _); - sch[_(2 + numLoopsX, 2 + numLoopsTotal)] << yPhi(last - i, _); + sch[_(2, 2 + numLoopsX)] << xPhi[last - i, _]; + sch[_(2 + numLoopsX, 2 + numLoopsTotal)] << yPhi[last - i, _]; if (fxy->unSatisfiableZeroRem(alloc, sch, numLambda, - unsigned(nonTimeDim))) { + ptrdiff_t(nonTimeDim))) { assert(!fyx->unSatisfiableZeroRem(alloc, sch, numLambda, - unsigned(nonTimeDim))); + ptrdiff_t(nonTimeDim))); return false; } if (fyx->unSatisfiableZeroRem(alloc, sch, numLambda, - unsigned(nonTimeDim))) + ptrdiff_t(nonTimeDim))) return true; } // assert(false); @@ -777,14 +834,14 @@ class Dependencies { } // returns `true` if forward, x->y static auto checkDirection(Arena<> alloc, - const std::array, 2> &p, - NotNull x, - NotNull y, unsigned numLambda, - Col nonTimeDim) -> bool { + const std::array, 2> &p, + Valid x, Valid y, + ptrdiff_t numLambda, Col<> nonTimeDim) -> bool { const auto &[fxy, fyx] = p; - unsigned numLoopsX = x->getCurrentDepth(), nTD = unsigned(nonTimeDim); + unsigned numLoopsX = x->getCurrentDepth(), nTD = ptrdiff_t(nonTimeDim); #ifndef NDEBUG - const unsigned numLoopsCommon = std::min(numLoopsX, y->getCurrentDepth()); + ptrdiff_t numLoopsCommon = + std::min(ptrdiff_t(numLoopsX), ptrdiff_t(y->getCurrentDepth())); #endif PtrVector xFusOmega = x->getFusionOmega(); PtrVector yFusOmega = y->getFusionOmega(); @@ -813,231 +870,190 @@ class Dependencies { invariant(false); return false; } - constexpr auto get(ID i, IR::Addr *in, IR::Addr *out) -> Dependence { - return Dependence{depPoly(i), depSatBnd(i), in, out, - satLevelPair(i), isForward(i) + constexpr auto get(ID i, IR::Addr *in, IR::Addr *out) const -> Dependence { + auto [depSat, depBnd] = depSatBnd(i); + return Dependence{.depPoly = depPoly(i), + .dependenceSatisfaction = depSat, + .dependenceBounding = depBnd, + .in = in, + .out = out, + .satLvl = 
satLevelPair(i), + .meta = getMeta(i) }; } - - constexpr void set(ID i, Dependence d) { - auto out = d.output(); - auto in = d.input(); - output(i) = out; - nextOut(i) = out->getEdgeOut(); - input(i) = in; - nextIn(i) = in->getEdgeIn(); - depSatBnd(i) = d.getSimplexPair(); - depPoly(i) = d.getDepPoly(); - satLevelPair(i) = d.satLvl; - isForward(i) = d.isForward(); - } - - auto push_pack(Arena<> *alloc, Dependence d) -> void * { - void *ret = nullptr; - if (numData == getCapacity()) { - auto newCapacity = getCapacity() * 2; - auto *newData = alloc->allocate(memNeeded(newCapacity)); - std::memcpy(newData, data, memNeeded(numData)); - ret = std::exchange(data, newData); + static auto innermostNonZero(PtrMatrix A, ptrdiff_t skip) + -> ptrdiff_t { + for (ptrdiff_t i = ptrdiff_t(A.numCol()); --i;) { + if (i == skip) continue; + if (!math::allZero(A[_, i])) return i; } - set(ID{numData++}, d); - return ret; - } - [[nodiscard]] constexpr auto getCapacity() const noexcept -> int32_t { - return int32_t(std::bit_ceil(uint32_t(numData))); - } - - constexpr auto outAddrPtr() -> IR::Addr ** { - void *p = data; - return static_cast(p); - } - [[nodiscard]] constexpr auto outAddrPtr() const -> IR::Addr *const * { - const void *p = data; - return static_cast(p); - } - constexpr auto inAddrPtr() -> IR::Addr ** { - void *p = data + sizeof(IR::Addr *) * getCapacity(); - return static_cast(p); - } - [[nodiscard]] constexpr auto inAddrPtr() const -> IR::Addr *const * { - const void *p = data + sizeof(IR::Addr *) * getCapacity(); - return static_cast(p); - } - constexpr auto outEdgePtr() -> int32_t * { - unsigned cap = getCapacity(); - void *p = data + sizeof(IR::Addr *) * 2 * cap; - return static_cast(p); - } - [[nodiscard]] constexpr auto outEdgePtr() const -> const int32_t * { - unsigned cap = getCapacity(); - const void *p = data + sizeof(IR::Addr *) * 2 * cap; - return static_cast(p); - } - constexpr auto inEdgePtr() -> int32_t * { - unsigned cap = getCapacity(); - void *p = data + 
(sizeof(IR::Addr *) * 2 + sizeof(int32_t)) * cap; - return static_cast(p); - } - [[nodiscard]] constexpr auto inEdgePtr() const -> const int32_t * { - unsigned cap = getCapacity(); - const void *p = data + (sizeof(IR::Addr *) * 2 + sizeof(int32_t)) * cap; - return static_cast(p); - } - constexpr auto satLevelsPtr() -> std::array * { - unsigned cap = getCapacity(); - void *p = - data + - ((sizeof(IR::Addr *) + sizeof(int32_t) + sizeof(math::Simplex *)) * 2 + - sizeof(DepPoly *)) * - cap; - return static_cast *>(p); - } - [[nodiscard]] constexpr auto satLevelsPtr() const - -> const std::array * { - unsigned cap = getCapacity(); - const void *p = - data + - ((sizeof(IR::Addr *) + sizeof(int32_t) + sizeof(math::Simplex *)) * 2 + - sizeof(DepPoly *)) * - cap; - return static_cast *>(p); + return -1; } public: - // field order: - // AddrOut - // AddrIn - // nextOut - // nextIn - // dependenceSatisfaction - // dependenceBounding - // depPoly - // satLevel - // isForward - constexpr auto get(ID i) -> Dependence { return get(i, input(i), output(i)); } - constexpr auto outAddrs() -> MutPtrVector { - return {outAddrPtr(), numData}; - } - constexpr auto inAddrs() -> MutPtrVector { - return {inAddrPtr(), numData}; - } + constexpr void removeEdge(ID id) { + removeOutEdge(id.id); + removeInEdge(id.id); + /// TODO: remove revTimeEdge? 
+ } + constexpr void removeOutEdge(int32_t id) { + int32_t prev = prevOut(poly::Dependence::ID{id}); + int32_t next = nextOut(poly::Dependence::ID{id}); + if (prev >= 0) nextOut(poly::Dependence::ID{prev}) = next; + if (next >= 0) prevOut(poly::Dependence::ID{next}) = prev; + } + constexpr void removeInEdge(int32_t id) { + int32_t prev = prevIn(poly::Dependence::ID{id}); + int32_t next = nextIn(poly::Dependence::ID{id}); + if (prev >= 0) nextIn(poly::Dependence::ID{prev}) = next; + if (next >= 0) prevIn(poly::Dependence::ID{next}) = prev; + } + [[nodiscard]] constexpr auto get(ID i) const -> Dependence { + return get(i, input(i), output(i)); + } + // constexpr auto outAddrs() -> MutPtrVector { + // return {outAddrPtr(), numData}; + // } + // constexpr auto inAddrs() -> MutPtrVector { + // return {inAddrPtr(), numData}; + // } constexpr auto outEdges() -> MutPtrVector { - return {outEdgePtr(), numData}; + return datadeps.template get(); } constexpr auto inEdges() -> MutPtrVector { - return {inEdgePtr(), numData}; + return datadeps.template get(); } [[nodiscard]] constexpr auto outEdges() const -> PtrVector { - return {outEdgePtr(), unsigned(numData)}; + return datadeps.template get(); } [[nodiscard]] constexpr auto inEdges() const -> PtrVector { - return {inEdgePtr(), unsigned(numData)}; - } - constexpr auto satLevels() -> MutPtrVector> { - return {satLevelsPtr(), numData}; + return datadeps.template get(); } + // [[nodiscard]] constexpr auto outEdges() const -> PtrVector { + // return {outEdgePtr(), unsigned(numData)}; + // } + // [[nodiscard]] constexpr auto inEdges() const -> PtrVector { + // return {inEdgePtr(), unsigned(numData)}; + // } + // constexpr auto satLevels() -> MutPtrVector> { + // return {satLevelsPtr(), numData}; + // } [[nodiscard]] constexpr auto output(ID i) -> IR::Addr *& { - return outAddrPtr()[i.id]; + return datadeps.template get(i.id); } - [[nodiscard]] constexpr auto output(ID i) const -> const IR::Addr * { - return outAddrPtr()[i.id]; + 
[[nodiscard]] constexpr auto output(ID i) const -> IR::Addr * { + return datadeps.template get(i.id); } [[nodiscard]] constexpr auto input(ID i) -> IR::Addr *& { - return inAddrPtr()[i.id]; + return datadeps.template get(i.id); } - [[nodiscard]] constexpr auto input(ID i) const -> const IR::Addr * { - return inAddrPtr()[i.id]; + [[nodiscard]] constexpr auto input(ID i) const -> IR::Addr * { + return datadeps.template get(i.id); } constexpr auto nextOut(ID i) -> int32_t & { - unsigned cap = getCapacity(); - void *p = data + sizeof(int32_t) * i.id + sizeof(IR::Addr *) * 2 * cap; - return *static_cast(p); + return datadeps.template get(i.id); + } + constexpr auto prevOut(ID i) -> int32_t & { + return datadeps.template get(i.id); } constexpr auto nextIn(ID i) -> int32_t & { - unsigned cap = getCapacity(); - void *p = data + sizeof(int32_t) * i.id + - (sizeof(IR::Addr *) * 2 + sizeof(int32_t)) * cap; - return *static_cast(p); + return datadeps.template get(i.id); + } + constexpr auto prevIn(ID i) -> int32_t & { + return datadeps.template get(i.id); + } + constexpr auto depSatBnd(ID i) -> std::array, 2> & { + return datadeps.template get(i.id); } - constexpr auto depSatBnd(ID i) -> std::array, 2> & { - unsigned cap = getCapacity(); - void *p = data + 2 * sizeof(math::Simplex *) * i.id + - (sizeof(IR::Addr *) + sizeof(int32_t)) * 2 * cap; - return *static_cast, 2> *>(p); + constexpr auto revTimeEdge(ID i) -> int32_t & { + return datadeps.template get(i.id); + } + [[nodiscard]] constexpr auto revTimeEdge(ID i) const -> int32_t { + return datadeps.template get(i.id); } constexpr auto depPoly(ID i) -> DepPoly *& { - unsigned cap = getCapacity(); - void *p = data + sizeof(DepPoly *) * i.id + - (sizeof(IR::Addr *) + sizeof(int32_t) + sizeof(math::Simplex *)) * - 2 * cap; - return *static_cast(p); + return datadeps.template get(i.id); + } + [[nodiscard]] constexpr auto depSatBnd(ID i) const + -> std::array, 2> { + return datadeps.template get(i.id); + } + [[nodiscard]] 
constexpr auto depPoly(ID i) const -> DepPoly * { + return datadeps.template get(i.id); } constexpr auto satLevelPair(ID i) -> std::array & { - return satLevelsPtr()[i.id]; + return datadeps.template get(i.id); } [[nodiscard]] constexpr auto satLevelPair(ID i) const - -> const std::array & { - return satLevelsPtr()[i.id]; + -> std::array { + return datadeps.template get(i.id); } - constexpr auto satLevel(ID i) -> uint8_t { - auto pair = satLevelPair(i); - return Dependence::satLevelMask(pair[0]); + [[nodiscard]] constexpr auto satLevel(ID i) const -> uint8_t { + return Dependence::satLevelMask(satLevelPair(i)[0]); } [[nodiscard]] constexpr auto isSat(ID i, unsigned depth) const -> uint8_t { - auto pair = satLevelPair(i); - return Dependence::satLevelMask(pair[0]) <= depth; + return Dependence::satLevelMask(satLevelPair(i)[0]) <= depth; + } + [[nodiscard]] constexpr auto isActive(ID i, unsigned depth) const -> uint8_t { + return Dependence::satLevelMask(satLevelPair(i)[0]) > depth; } - [[nodiscard]] constexpr auto isForward(ID i) const noexcept -> bool & { - unsigned cap = getCapacity(); - void *p = - data + sizeof(bool) * i.id + - ((sizeof(IR::Addr *) + sizeof(int32_t) + sizeof(math::Simplex *)) * 2 + - sizeof(std::array) + sizeof(DepPoly *)) * - cap; - return *static_cast(p); + [[nodiscard]] constexpr auto getMeta(ID i) noexcept -> uint8_t & { + return datadeps.template get(i.id); + } + [[nodiscard]] constexpr auto getMeta(ID i) const noexcept -> uint8_t { + return datadeps.template get(i.id); + } + [[nodiscard]] constexpr auto getPeel(ID i) noexcept -> uint8_t & { + return datadeps.template get(i.id); + } + [[nodiscard]] constexpr auto getPeel(ID i) const noexcept -> uint8_t { + return datadeps.template get(i.id); + } + [[nodiscard]] constexpr auto isForward(ID i) const noexcept -> bool { + return getMeta(i) & 1; } class Ref { - Dependencies *deps; - ID i; + Dependencies *deps_; + ID i_; public: - Ref(Dependencies *deps, ID i) : deps(deps), i(i) {} - operator 
Dependence() const { return deps->get(i); } + constexpr Ref(Dependencies *deps, ID i) : deps_(deps), i_(i) {} + operator Dependence() const { return deps_->get(i_); } auto operator=(Dependence d) -> Ref & { - deps->set(i, d); + deps_->set(i_, d); return *this; } }; - void check(Arena<> *alloc, NotNull x, NotNull y) { + void check(Arena<> *alloc, Valid x, Valid y) { // TODO: implement gcd test // if (x.gcdKnownIndependent(y)) return {}; DepPoly *dxy{DepPoly::dependence(alloc, x, y)}; if (!dxy) return; - invariant(x->getCurrentDepth(), dxy->getDim0()); - invariant(y->getCurrentDepth(), dxy->getDim1()); - invariant(x->getCurrentDepth() + y->getCurrentDepth(), - dxy->getNumPhiCoef()); + invariant(x->getCurrentDepth() == ptrdiff_t(dxy->getDim0())); + invariant(y->getCurrentDepth() == ptrdiff_t(dxy->getDim1())); + invariant(x->getCurrentDepth() + y->getCurrentDepth() == + ptrdiff_t(dxy->getNumPhiCoef())); // note that we set boundAbove=true, so we reverse the // dependence direction for the dependency we seek, we'll // discard the program variables x then y - std::array, 2> pair(dxy->farkasPair(alloc)); + std::array, 2> pair(dxy->farkasPair(alloc)); if (dxy->getTimeDim()) timeCheck(alloc, dxy, x, y, pair); else timelessCheck(alloc, dxy, x, y, pair); } - inline void copyDependencies(Arena<> *alloc, IR::Addr *src, IR::Addr *dst); + inline void copyDependencies(IR::Addr *src, IR::Addr *dst); // reload store `x` - auto reload(Arena<> *alloc, NotNull store) -> NotNull { - NotNull dxy{DepPoly::self(alloc, store)}; - std::array, 2> pair(dxy->farkasPair(alloc)); - NotNull load = store->reload(alloc); - copyDependencies(alloc, store, load); + auto reload(Arena<> *alloc, Valid store) -> Valid { + Valid dxy{DepPoly::self(alloc, store)}; + std::array, 2> pair(dxy->farkasPair(alloc)); + Valid load = store->reload(alloc); + copyDependencies(store, load); if (dxy->getTimeDim()) timeCheck(alloc, dxy, store, load, pair, true); - else timelessCheck(alloc, dxy, store, load, pair, true); +
else addOrdered(dxy, store, load, pair, true); return load; } [[nodiscard]] constexpr auto inputEdgeIDs(int32_t id) const { @@ -1046,96 +1062,210 @@ class Dependencies { [[nodiscard]] constexpr auto outputEdgeIDs(int32_t id) const { return utils::VForwardRange{outEdges(), id}; } + [[nodiscard]] constexpr auto getEdgeTransform() const { + auto f = [=, this](int32_t id) { return get(Dependence::ID{id}); }; + return std::views::transform(f); + } [[nodiscard]] constexpr auto inputEdges(int32_t id) const { - auto f = [this](int32_t id) { - Dependencies d = *this; - return d.get(ID{id}); - }; - return inputEdgeIDs(id) | std::views::transform(f); + return inputEdgeIDs(id) | getEdgeTransform(); } [[nodiscard]] constexpr auto outputEdges(int32_t id) const { - auto f = [this](int32_t id) { - Dependencies d = *this; - return d.get(Dependence::ID{id}); - }; - return outputEdgeIDs(id) | std::views::transform(f); + return outputEdgeIDs(id) | getEdgeTransform(); } - [[nodiscard]] constexpr auto activeFilter(unsigned depth) const { - auto f = [=](int32_t id) -> bool { - return !isSat(Dependence::ID{id}, depth); + [[nodiscard]] constexpr auto activeFilter(int depth) const { + auto f = [=, this](int32_t id) -> bool { + return isActive(Dependence::ID{id}, depth); }; return std::views::filter(f); } [[nodiscard]] constexpr auto inputAddrTransform() { - auto f = [=](int32_t id) { return input(Dependence::ID{id}); }; + auto f = [=, this](int32_t id) { return input(Dependence::ID{id}); }; return std::views::transform(f); } [[nodiscard]] constexpr auto outputAddrTransform() { - auto f = [=](int32_t id) { return output(Dependence::ID{id}); }; + auto f = [=, this](int32_t id) { return output(Dependence::ID{id}); }; return std::views::transform(f); } + [[nodiscard]] constexpr auto inputAddrTransform() const { + auto f = [=, this](int32_t id) { return input(Dependence::ID{id}); }; + return std::views::transform(f); + } + [[nodiscard]] constexpr auto outputAddrTransform() const { + auto f = [=, 
this](int32_t id) { return output(Dependence::ID{id}); }; + return std::views::transform(f); + } + /// this function essentially indicates that this dependency does not prevent + /// the hoisting of a memory access out of a loop, because a memory->register + /// transform is possible. + /// The requirements are that the `indexMatrix` match + [[nodiscard]] constexpr auto registerEligible(ID id) const -> bool { + /// If no repeated accesses across time, it can't be hoisted out + if (revTimeEdge(id) < 0) return false; + DensePtrMatrix inMat{input(id)->indexMatrix()}, + outMat{output(id)->indexMatrix()}; + ptrdiff_t numLoopsIn = ptrdiff_t(inMat.numCol()), + numLoopsOut = ptrdiff_t(outMat.numCol()), + numLoops = std::min(numLoopsIn, numLoopsOut); + if ((numLoopsIn != numLoopsOut) && + math::anyNEZero(numLoopsIn > numLoopsOut + ? inMat[_, _(numLoopsOut, numLoopsIn)] + : outMat[_, _(numLoopsIn, numLoopsOut)])) + return false; + return inMat[_, _(0, numLoops)] == outMat[_, _(0, numLoops)]; + } + [[nodiscard]] constexpr auto registerEligibleFilter() const { + auto f = [=, this](int32_t id) -> bool { + return registerEligible(Dependence::ID{id}); + }; + return std::views::filter(f); + } + /// NOTE: this method uses `in` and `out` to check for reorderability, as + /// these get rotated after the simplex solve, while the stored `DepPoly` and + /// simplices do not. 
+ inline auto determinePeelDepth(IR::Loop *, int32_t) + -> utils::Optional; }; -static_assert(std::is_trivially_copyable_v); -static_assert(std::is_trivially_destructible_v); } // namespace poly namespace IR { using poly::Dependencies; -inline auto Addr::inputEdges(Dependencies deps) const { +inline auto Addr::inputEdges(const Dependencies &deps) const { return deps.inputEdges(getEdgeIn()); } -inline auto Addr::outputEdges(Dependencies deps) const { +inline auto Addr::outputEdges(const Dependencies &deps) const { return deps.outputEdges(getEdgeOut()); } -inline auto Addr::inputEdgeIDs(Dependencies deps) const { +inline auto Addr::inputEdgeIDs(const Dependencies &deps) const + -> utils::VForwardRange { return deps.inputEdgeIDs(getEdgeIn()); } -inline auto Addr::outputEdgeIDs(Dependencies deps) const { +inline auto Addr::outputEdgeIDs(const Dependencies &deps) const + -> utils::VForwardRange { return deps.outputEdgeIDs(getEdgeOut()); } +inline auto Addr::inputEdgeIDs(const Dependencies &deps, int depth) const { + return inputEdgeIDs(deps) | deps.activeFilter(depth); +} +inline auto Addr::outputEdgeIDs(const Dependencies &deps, int depth) const { + return outputEdgeIDs(deps) | deps.activeFilter(depth); +} -inline auto IR::Addr::inputAddrs(Dependencies deps) const { +inline auto IR::Addr::inputAddrs(const Dependencies &deps) const { return inputEdgeIDs(deps) | deps.inputAddrTransform(); } -inline auto IR::Addr::outputAddrs(Dependencies deps) const { +inline auto IR::Addr::outputAddrs(const Dependencies &deps) const { return outputEdgeIDs(deps) | deps.outputAddrTransform(); } - -inline auto Addr::inputEdges(Dependencies deps, unsigned depth) const { - return inputEdgeIDs(deps) | deps.activeFilter(depth); +inline auto Addr::inputEdges(const Dependencies &deps, int depth) const { + return inputEdgeIDs(deps) | deps.activeFilter(depth) | + deps.getEdgeTransform(); } -inline auto Addr::outputEdges(Dependencies deps, unsigned depth) const { - return outputEdgeIDs(deps) | 
deps.activeFilter(depth); +inline auto Addr::outputEdges(const Dependencies &deps, int depth) const { + return outputEdgeIDs(deps) | deps.activeFilter(depth) | + deps.getEdgeTransform(); } - -inline auto IR::Addr::inputAddrs(Dependencies deps, unsigned depth) const { - return inputEdges(deps, depth) | deps.inputAddrTransform(); +inline auto IR::Addr::inputAddrs(const Dependencies &deps, int depth) const { + return inputEdgeIDs(deps, depth) | deps.inputAddrTransform(); } -inline auto IR::Addr::outputAddrs(Dependencies deps, unsigned depth) const { - return outputEdges(deps, depth) | deps.outputAddrTransform(); +inline auto IR::Addr::outputAddrs(const Dependencies &deps, int depth) const { + return outputEdgeIDs(deps, depth) | deps.outputAddrTransform(); +} +inline auto IR::Addr::unhoistableOutputs(const Dependencies &deps, + int depth) const { + return outputEdgeIDs(deps, depth) | deps.registerEligibleFilter() | + deps.outputAddrTransform(); +} + +/// Addr::operator->(const Dependencies& deps) +/// drop `this` from the graph, and remove it from `deps` +inline void IR::Addr::drop(Dependencies &deps) { + // NOTE: this doesn't get removed from the `origAddr` list/the addrChain + if (IR::Loop *L = getLoop(); L->getChild() == this) L->setChild(getNext()); + removeFromList(); + for (int32_t id : inputEdgeIDs(deps)) deps.removeEdge(Dependence::ID{id}); + for (int32_t id : outputEdgeIDs(deps)) deps.removeEdge(Dependence::ID{id}); } +using math::StridedVector; } // namespace IR namespace poly { -inline void Dependencies::copyDependencies(Arena<> *alloc, IR::Addr *src, - IR::Addr *dst) { +inline void Dependencies::copyDependencies(IR::Addr *src, IR::Addr *dst) { for (int32_t id : src->inputEdgeIDs(*this)) { IR::Addr *input = this->input(Dependence::ID{id}); if (input->isLoad()) continue; Dependence d = get(Dependence::ID{id}, input, dst); - addEdge(alloc, d); + addEdge(d); } for (int32_t id : src->outputEdgeIDs(*this)) { IR::Addr *output = this->output(Dependence::ID{id}); if 
(output->isLoad()) continue; Dependence d = get(Dependence::ID{id}, dst, output); - addEdge(alloc, d); + addEdge(d); } } +// returns `true` if this dependence can be reordered due to peeling, `false` +// otherwise. Note that the associated loop itself may need scalarization, but +// subloop evaluations could be reorderable. How would we capture +// dependencies/uses like +// int64_t x = 0; +// for (ptrdiff_t m = 0; m < M; ++m){ +// x += a[m]; +// b[m] = x; +// } +// we have `x +=` as a reassociable self-dependence, but the fact it is stored +// into `b[m]` means that we can't really reassociate, as each nominal +// intermediate value of `x` must be realized! +// We must check that there are no other reads. Note that this is represented as +// int64_t x[1]{}; +// for (ptrdiff_t m = 0; m < M; ++m){ +// x[0] = x[0] + a[m]; +// b[m] = x[0]; +// } +// So we have write->read dependence for the store `x[0] =` to the read in +// `b[m] = x[0]`. The key observation here is that `x[0]` has a time component; +// the violation occurs because we store in another location, providing a +// non-reassociable component. +inline auto Dependencies::determinePeelDepth(IR::Loop *L, int32_t id) + -> utils::Optional { + auto id_ = Dependence::ID{id}; + IR::Addr *in = input(id_), *out = output(id_); + // clang-format off + // If we have a dependency nested inside `L`, we won't be able to reorder if either + // a) that dependency's output is `in` + // b) that dependency's input is `out` + // as we'd then have to maintain the order of this loop level's evaluations with respect + // to the subloop. + // Otherwise, we check + // 1. If this dependency may be peeled. For this, it must + // a) be indexed by both `L` and a subloop of `L`. + // b) have an equality relation, so that it occurs for a single iteration of the subloop. + // Then, we can split the subloop across this value, scalarizing around it. + // 2. Is this dependency reassociable?
E.g., if it's connected by reassociable adds + // (such as integer adds, or floating point with the reassociable FMF), then mark it as such. + // clang-format on + // + // if (anyInteriorDependencies(L, in) || anyInteriorDependents(L, out)) + // return false; + // no inner dependence + PtrMatrix inInd = in->indexMatrix(), outInd = out->indexMatrix(); + invariant(inInd.numRow(), outInd.numRow()); + ptrdiff_t d = L->getCurrentDepth(); + invariant(inInd.numRow() >= d); + bool noInIndAtDepth = math::allZero(inInd[_, d]), + noOutIndAtDepth = math::allZero(outInd[_, d]); + if (noInIndAtDepth == noOutIndAtDepth) return -1; + // now, we want to find a loop that `in` depends on but `out` does not + // so that we can split over this loop. + // For now, to simplify codegen, we only accept the innermost non-zero + ptrdiff_t i = innermostNonZero(noInIndAtDepth ? inInd : outInd, d); + if (i >= 0) getPeel(id_) = i; + return i >= 0 ? utils::Optional{size_t(i)} + : utils::Optional{}; +} } // namespace poly } // namespace poly diff --git a/include/Polyhedra/DependencyPolyhedra.hpp b/include/Polyhedra/DependencyPolyhedra.hpp index df4c6f26d..1acdec1ab 100644 --- a/include/Polyhedra/DependencyPolyhedra.hpp +++ b/include/Polyhedra/DependencyPolyhedra.hpp @@ -1,24 +1,22 @@ - #pragma once #include "IR/Address.hpp" #include "Polyhedra/Loops.hpp" #include "Polyhedra/Polyhedra.hpp" #include "Support/OStream.hpp" +#include #include #include #include #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -27,20 +25,21 @@ #include namespace poly::poly { +using math::shape; /// prints in current permutation order. /// TODO: decide if we want to make poly::Loop a `SymbolicPolyhedra` /// in which case, we have to remove `currentToOriginalPerm`, -/// which menas either change printing, or move prints `<<` into +/// which means either change printing, or move prints `<<` into /// the derived classes. 
inline auto printConstraints(std::ostream &os, DensePtrMatrix A, llvm::ArrayRef syms, bool inequality = true) -> std::ostream & { - const Row numConstraints = A.numRow(); - const unsigned numSyms = syms.size() + 1; - for (Row c = 0; c < numConstraints; ++c) { - printConstraint(os, A(c, _), numSyms, inequality); + Row numConstraints = A.numRow(); + unsigned numSyms = syms.size() + 1; + for (ptrdiff_t c = 0; c < numConstraints; ++c) { + printConstraint(os, A[c, _], numSyms, inequality); for (ptrdiff_t v = 1; v < numSyms; ++v) { - if (int64_t Acv = A(c, v)) { + if (int64_t Acv = A[c, v]) { os << (Acv > 0 ? " + " : " - "); Acv = math::constexpr_abs(Acv); if (Acv != 1) os << Acv << "*"; @@ -181,12 +180,12 @@ class DepPoly : public BasePolyhedra { constexpr void decrementNumConstraints() { invariant(numCon-- > 0); } constexpr auto getA() -> MutDensePtrMatrix { void *p = memory; - return {(int64_t *)p, math::DenseDims{numCon, getNumVar() + 1}}; + return {(int64_t *)p, math::DenseDims<>{{numCon}, {getNumVar() + 1}}}; } constexpr auto getE() -> MutDensePtrMatrix { void *p = memory; return {(int64_t *)p + size_t(conCapacity) * (getNumVar() + 1), - math::DenseDims{numEqCon, getNumVar() + 1}}; + math::DenseDims<>{{numEqCon}, {getNumVar() + 1}}}; } constexpr auto getNullStep() -> math::MutPtrVector { void *p = memory; @@ -211,28 +210,28 @@ class DepPoly : public BasePolyhedra { [[nodiscard]] auto getA() const -> DensePtrMatrix { const char *p = memory; return {const_cast(reinterpret_cast(p)), - math::DenseDims{numCon, getNumVar() + 1}}; + math::DenseDims<>{{numCon}, {getNumVar() + 1}}}; } - [[nodiscard]] auto getA(Row r, Col c) -> int64_t & { + [[nodiscard]] auto getA(Row<> r, Col<> c) -> int64_t & { auto *p = reinterpret_cast(memory); - return p[size_t(r) * (getNumVar() + 1) + size_t(c)]; + return p[ptrdiff_t(r) * (getNumVar() + 1) + ptrdiff_t(c)]; } - [[nodiscard]] auto getA(Row r, Col c) const -> int64_t { + [[nodiscard]] auto getA(Row<> r, Col<> c) const -> int64_t { const 
auto *p = reinterpret_cast(memory); - return p[size_t(r) * (getNumVar() + 1) + size_t(c)]; + return p[ptrdiff_t(r) * (getNumVar() + 1) + ptrdiff_t(c)]; } [[nodiscard]] auto getE() const -> DensePtrMatrix { const auto *p = reinterpret_cast(memory); return {const_cast(p + size_t(conCapacity) * (getNumVar() + 1)), - math::DenseDims{numEqCon, getNumVar() + 1}}; + math::DenseDims<>{numEqCon, getNumVar() + 1}}; } - [[nodiscard]] auto getE(Row r, Col c) -> int64_t & { + [[nodiscard]] auto getE(Row<> r, Col<> c) -> int64_t & { auto *p = reinterpret_cast(memory); - return p[(conCapacity + size_t(r)) * (getNumVar() + 1) + size_t(c)]; + return p[(conCapacity + ptrdiff_t(r)) * (getNumVar() + 1) + ptrdiff_t(c)]; } - [[nodiscard]] auto getE(Row r, Col c) const -> int64_t { + [[nodiscard]] auto getE(Row<> r, Col<> c) const -> int64_t { const auto *p = reinterpret_cast(memory); - return p[(conCapacity + size_t(r)) * (getNumVar() + 1) + size_t(c)]; + return p[(conCapacity + ptrdiff_t(r)) * (getNumVar() + 1) + ptrdiff_t(c)]; } [[nodiscard]] auto getNullStep() const -> PtrVector { const auto *p = reinterpret_cast(memory); @@ -249,56 +248,56 @@ class DepPoly : public BasePolyhedra { numDynSym}; } auto getSymbols(ptrdiff_t i) -> math::MutPtrVector { - return getA()(i, _(math::begin, getNumSymbols())); + return getA()[i, _(math::begin, getNumSymbols())]; } [[nodiscard]] auto getInEqSymbols(ptrdiff_t i) const -> PtrVector { - return getA()(i, _(math::begin, getNumSymbols())); + return getA()[i, _(math::begin, getNumSymbols())]; } [[nodiscard]] auto getEqSymbols(ptrdiff_t i) const -> PtrVector { - return getE()(i, _(math::begin, getNumSymbols())); + return getE()[i, _(math::begin, getNumSymbols())]; } [[nodiscard]] auto getCompTimeInEqOffset(ptrdiff_t i) const -> std::optional { - if (!allZero(getA()(i, _(1, getNumSymbols())))) return {}; - return getA()(i, 0); + if (!allZero(getA()[i, _(1, getNumSymbols())])) return {}; + return getA()[i, 0]; } [[nodiscard]] auto 
getCompTimeEqOffset(ptrdiff_t i) const -> std::optional { - if (!allZero(getE()(i, _(1, getNumSymbols())))) return {}; - return getE()(i, 0); + if (!allZero(getE()[i, _(1, getNumSymbols())])) return {}; + return getE()[i, 0]; } static constexpr auto findFirstNonEqual(PtrVector x, PtrVector y) -> ptrdiff_t { return std::distance( x.begin(), std::mismatch(x.begin(), x.end(), y.begin(), y.end()).first); } - static auto nullSpace(NotNull x, NotNull y) + static auto nullSpace(Valid x, Valid y) -> math::DenseMatrix { unsigned numLoopsCommon = findFirstNonEqual(x->getFusionOmega(), y->getFusionOmega()), xDim = x->getArrayDim(), yDim = y->getArrayDim(); - math::DenseMatrix A(math::DenseDims{numLoopsCommon, xDim + yDim}); + math::DenseMatrix A( + math::DenseDims<>{numLoopsCommon, xDim + yDim}); if (!numLoopsCommon) return A; // indMats cols are [outerMostLoop,...,innerMostLoop] PtrMatrix indMatX = x->indexMatrix(), indMatY = y->indexMatrix(); unsigned indDepth = std::min(x->getNaturalDepth(), y->getNaturalDepth()); for (ptrdiff_t i = 0; i < std::min(numLoopsCommon, indDepth); ++i) { - A(i, _(0, xDim)) << indMatX(_, i); - A(i, _(xDim, end)) << indMatY(_, i); + A[i, _(0, xDim)] << indMatX[_, i]; + A[i, _(xDim, end)] << indMatY[_, i]; } - for (ptrdiff_t i = indDepth; i < numLoopsCommon; ++i) A(i, _) << 0; + for (ptrdiff_t i = indDepth; i < numLoopsCommon; ++i) A[i, _] << 0; // returns rank x num loops return orthogonalNullSpace(std::move(A)); } - static auto nullSpace(NotNull x) - -> math::DenseMatrix { + static auto nullSpace(Valid x) -> math::DenseMatrix { unsigned numLoopsCommon = x->getCurrentDepth(), dim = x->getArrayDim(), natDepth = x->getNaturalDepth(); - math::DenseMatrix A(math::DenseDims{numLoopsCommon, dim}); + math::DenseMatrix A(math::DenseDims<>{numLoopsCommon, dim}); if (!numLoopsCommon) return A; // indMats cols are [outerMostLoop,...,innerMostLoop] - A(_(0, natDepth), _) << x->indexMatrix().transpose(); - if (natDepth < numLoopsCommon) A(_(natDepth, end), _) 
<< 0; + A[_(0, natDepth), _] << x->indexMatrix().t(); + if (natDepth < numLoopsCommon) A[_(natDepth, end), _] << 0; // returns rank x num loops return orthogonalNullSpace(std::move(A)); } @@ -340,19 +339,19 @@ class DepPoly : public BasePolyhedra { ((conCapacity + eqConCapacity) * (getNumVar() + 1) + timeDim) + sizeof(const llvm::SCEV *) * numDynSym; } - auto copy(Arena<> *alloc) const -> NotNull { + auto copy(Arena<> *alloc) const -> Valid { auto *p = alloc->template allocate(neededBytes()); std::memcpy(p, this, neededBytes()); - return NotNull{p}; + return Valid{p}; } - static auto dependence(Arena<> *alloc, NotNull aix, - NotNull aiy) -> DepPoly * { + static auto dependence(Arena<> *alloc, Valid aix, + Valid aiy) -> DepPoly * { assert(aix->sizesMatch(aiy)); unsigned numDep0Var = aix->getCurrentDepth(), numDep1Var = aiy->getCurrentDepth(), numVar = numDep0Var + numDep1Var; - NotNull loopx = aix->getAffLoop(); - NotNull loopy = aiy->getAffLoop(); + Valid loopx = aix->getAffLoop(); + Valid loopy = aiy->getAffLoop(); PtrMatrix Ax{loopx->getOuterA(numDep0Var)}, Ay{loopy->getOuterA(numDep1Var)}; auto Sx{loopx->getSyms()}, Sy{loopy->getSyms()}; @@ -362,18 +361,18 @@ class DepPoly : public BasePolyhedra { invariant(Cx.numRow(), Cy.numRow()); invariant(Cx.numCol() <= numDep0Var); invariant(Cy.numCol() <= numDep1Var); - auto [nc0, nv0] = Ax.size(); - auto [nc1, nv1] = Ay.size(); + auto [nc0, nv0] = shape(Ax); + auto [nc1, nv1] = shape(Ay); math::Vector map; unsigned numDynSym = mergeMap(map, Sx, Sy); invariant(ptrdiff_t(map.size()), ptrdiff_t(Sy.size())); unsigned numSym = numDynSym + 1; math::DenseMatrix NS{nullSpace(aix, aiy)}; - unsigned timeDim = unsigned{NS.numRow()}, - numCols = numVar + timeDim + numDynSym + 1, - conCapacity = unsigned(Ax.numRow() + Ay.numRow()) + numVar, - eqConCapacity = unsigned(Cx.numRow()) + timeDim; + ptrdiff_t timeDim = ptrdiff_t{NS.numRow()}, + numCols = numVar + timeDim + numDynSym + 1, + conCapacity = ptrdiff_t(Ax.numRow() + Ay.numRow()) 
+ numVar, + eqConCapacity = ptrdiff_t(Cx.numRow()) + timeDim; size_t memNeeded = sizeof(int64_t) * ((conCapacity + eqConCapacity) * numCols + timeDim) + @@ -385,10 +384,10 @@ class DepPoly : public BasePolyhedra { timeDim, conCapacity, eqConCapacity); // numDep1Var = nv1; - Row nc = nc0 + nc1; + ptrdiff_t nc = nc0 + nc1; unsigned indexDim{aix->getArrayDim()}; auto nullStep{dp->getNullStep()}; - for (ptrdiff_t i = 0; i < timeDim; ++i) nullStep[i] = selfDot(NS(i, _)); + for (ptrdiff_t i = 0; i < timeDim; ++i) nullStep[i] = norm2(NS[i, _]); // column meansing in in order // const size_t numSymbols = getNumSymbols(); auto A{dp->getA()}; @@ -399,38 +398,38 @@ class DepPoly : public BasePolyhedra { // E.resize(indexDim + nullDim, A.numCol()); // ma0 loop for (ptrdiff_t i = 0; i < nc0; ++i) { - A(i, _(0, 1 + Sx.size())) << Ax(i, _(0, 1 + Sx.size())); - A(i, _(numSym, numSym + numDep0Var)) - << Ax(i, _(1 + Sx.size(), 1 + Sx.size() + numDep0Var)); + A[i, _(0, 1 + Sx.size())] << Ax[i, _(0, 1 + Sx.size())]; + A[i, _(numSym, numSym + numDep0Var)] + << Ax[i, _(1 + Sx.size(), 1 + Sx.size() + numDep0Var)]; } for (ptrdiff_t i = 0; i < nc1; ++i) { - A(nc0 + i, 0) = Ay(i, 0); + A[nc0 + i, 0] = Ay[i, 0]; for (ptrdiff_t j = 0; j < map.size(); ++j) - A(nc0 + i, 1 + map[j]) = Ay(i, 1 + j); + A[nc0 + i, 1 + map[j]] = Ay[i, 1 + j]; for (ptrdiff_t j = 0; j < numDep1Var; ++j) - A(nc0 + i, j + numSym + numDep0Var) = Ay(i, j + 1 + Sy.size()); + A[nc0 + i, j + numSym + numDep0Var] = Ay[i, j + 1 + Sy.size()]; } - A(_(nc, end), _(numSym, numSym + numVar)).diag() << 1; + A[_(nc, end), _(numSym, numSym + numVar)].diag() << 1; // indMats are [outerMostLoop, ..., innerMostLoop] x arrayDim // offsetMats are arrayDim x numSymbols // E(i,:)* indVars = q[i] // e.g. 
i_0 + j_0 + off_0 = i_1 + j_1 + off_1 // i_0 + j_0 - i_1 - j_1 = off_1 - off_0 for (ptrdiff_t i = 0; i < indexDim; ++i) { - E(i, _(0, Ox.numCol())) << Ox(i, _); - E(i, _(0, Cx.numCol()) + numSym) << Cx(i, _); - E(i, 0) -= Oy(i, 0); - for (ptrdiff_t j = 0; j < Oy.numCol() - 1; ++j) - E(i, 1 + map[j]) -= Oy(i, 1 + j); - E(i, _(0, Cy.numCol()) + numSym + numDep0Var) << -Cy(i, _); + E[i, _(0, Ox.numCol())] << Ox[i, _]; + E[i, _(0, Cx.numCol()) + numSym] << Cx[i, _]; + E[i, 0] -= Oy[i, 0]; + for (ptrdiff_t j = 0, J = ptrdiff_t(Oy.numCol()) - 1; j < J; ++j) + E[i, 1 + map[j]] -= Oy[i, 1 + j]; + E[i, _(0, Cy.numCol()) + numSym + numDep0Var] << -Cy[i, _]; } for (ptrdiff_t i = 0; i < timeDim; ++i) { for (ptrdiff_t j = 0; j < NS.numCol(); ++j) { - int64_t nsij = NS(i, j); - E(indexDim + i, j + numSym) = nsij; - E(indexDim + i, j + numSym + numDep0Var) = -nsij; + int64_t nsij = NS[i, j]; + E[indexDim + i, j + numSym] = nsij; + E[indexDim + i, j + numSym + numDep0Var] = -nsij; } - E(indexDim + i, numSym + numVar + i) = 1; + E[indexDim + i, numSym + numVar + i] = 1; } dp->pruneBounds(*alloc); if (dp->getNumCon()) return dp; @@ -438,22 +437,21 @@ class DepPoly : public BasePolyhedra { return nullptr; } // self dependence - static auto self(Arena<> *alloc, NotNull ai) - -> NotNull { - NotNull loop = ai->getAffLoop(); + static auto self(Arena<> *alloc, Valid ai) -> Valid { + Valid loop = ai->getAffLoop(); unsigned numDepVar = ai->getCurrentDepth(), numVar = numDepVar + numDepVar; PtrMatrix B{loop->getOuterA(numDepVar)}; auto S{loop->getSyms()}; // numLoops x numDim PtrMatrix C{ai->indexMatrix()}, O{ai->offsetMatrix()}; - auto [nco, nv] = B.size(); + auto [nco, nv] = shape(B); math::DenseMatrix NS{nullSpace(ai)}; - unsigned numDynSym = S.size(), numSym = numDynSym + 1, - timeDim = unsigned{NS.numRow()}, - numCols = numVar + timeDim + numDynSym + 1, - conCapacity = unsigned(2 * B.numRow()) + numVar, - eqConCapacity = unsigned(C.numRow()) + timeDim; + ptrdiff_t numDynSym = 
ptrdiff_t(S.size()), numSym = numDynSym + 1, + timeDim = ptrdiff_t{NS.numRow()}, + numCols = numVar + timeDim + numDynSym + 1, + conCapacity = 2 * ptrdiff_t(B.numRow()) + numVar, + eqConCapacity = ptrdiff_t(C.numRow()) + timeDim; size_t memNeeded = sizeof(int64_t) * ((conCapacity + eqConCapacity) * numCols + timeDim) + @@ -464,10 +462,10 @@ class DepPoly : public BasePolyhedra { conCapacity, eqConCapacity); // numDep1Var = nv1; - Row nc = nco + nco; + ptrdiff_t nc = nco + nco; unsigned indexDim{ai->getArrayDim()}; auto nullStep{dp->getNullStep()}; - for (ptrdiff_t i = 0; i < timeDim; ++i) nullStep[i] = selfDot(NS(i, _)); + for (ptrdiff_t i = 0; i < timeDim; ++i) nullStep[i] = norm2(NS[i, _]); // column meansing in in order // const size_t numSymbols = getNumSymbols(); auto A{dp->getA()}; @@ -478,12 +476,12 @@ class DepPoly : public BasePolyhedra { // E.resize(indexDim + nullDim, A.numCol()); // ma0 loop for (ptrdiff_t i = 0; i < nco; ++i) { - for (ptrdiff_t j = 0; j < numSym; ++j) A(i + nco, j) = A(i, j) = B(i, j); + for (ptrdiff_t j = 0; j < numSym; ++j) A[i + nco, j] = A[i, j] = B[i, j]; for (ptrdiff_t j = 0; j < numDepVar; ++j) - A(i + nco, j + numSym + numDepVar) = A(i, j + numSym) = - B(i, j + numSym); + A[i + nco, j + numSym + numDepVar] = A[i, j + numSym] = + B[i, j + numSym]; } - A(_(nc, end), _(numSym, numSym + numVar)).diag() << 1; + A[_(nc, end), _(numSym, numSym + numVar)].diag() << 1; // L254: Assertion `col < numCol()` failed // indMats are [innerMostLoop, ..., outerMostLoop] x arrayDim // offsetMats are arrayDim x numSymbols @@ -492,18 +490,18 @@ class DepPoly : public BasePolyhedra { // i_0 + j_0 - i_1 - j_1 = off_1 - off_0 for (ptrdiff_t i = 0; i < indexDim; ++i) { for (ptrdiff_t j = 0; j < C.numCol(); ++j) { - int64_t Cji = C(i, j); - E(i, j + numSym) = Cji; - E(i, j + numSym + numDepVar) = -Cji; + int64_t Cji = C[i, j]; + E[i, j + numSym] = Cji; + E[i, j + numSym + numDepVar] = -Cji; } } for (ptrdiff_t i = 0; i < timeDim; ++i) { for (ptrdiff_t j 
= 0; j < NS.numCol(); ++j) { - int64_t nsij = NS(i, j); - E(indexDim + i, j + numSym) = nsij; - E(indexDim + i, j + numSym + numDepVar) = -nsij; + int64_t nsij = NS[i, j]; + E[indexDim + i, j + numSym] = nsij; + E[indexDim + i, j + numSym + numDepVar] = -nsij; } - E(indexDim + i, numSym + numVar + i) = 1; + E[indexDim + i, numSym + numVar + i] = 1; } dp->pruneBounds(*alloc); invariant(dp->getNumCon() > 0); @@ -524,7 +522,7 @@ class DepPoly : public BasePolyhedra { // // Time parameters are carried over into farkas polys [[nodiscard]] auto farkasPair(Arena<> *alloc) const - -> std::array, 2> { + -> std::array, 2> { auto A{getA()}, E{getE()}; const ptrdiff_t numEqualityConstraintsOld = ptrdiff_t(E.numRow()); @@ -551,28 +549,28 @@ class DepPoly : public BasePolyhedra { const ptrdiff_t numLambda = posEqEnd + numEqualityConstraintsOld; const ptrdiff_t numVarNew = numVarInterest + numLambda; invariant(ptrdiff_t(getNumLambda()), numLambda); - // std::array, 2> pair; - NotNull fw = + // std::array, 2> pair; + Valid fw = math::Simplex::create(alloc, numConstraintsNew, numVarNew, 0); // Simplex &fw(pair[0]); // fw.resize(numConstraintsNew, numVarNew + 1); auto fCF{fw->getConstraints()}; fCF << 0; - math::MutPtrMatrix fC{fCF(_, _(1, end))}; + math::MutPtrMatrix fC{fCF[_, _(1, end)]}; // fC(_, 0) << 0; - fC(0, 0) = 1; // lambda_0 - fC(_, _(1, 1 + numInequalityConstraintsOld)) - << A(_, _(math::begin, numConstraintsNew)).transpose(); - // fC(_, _(ineqEnd, posEqEnd)) = E.transpose(); - // fC(_, _(posEqEnd, numVarNew)) = -E.transpose(); + fC[0, 0] = 1; // lambda_0 + fC[_, _(1, 1 + numInequalityConstraintsOld)] + << A[_, _(math::begin, numConstraintsNew)].t(); + // fC(_, _(ineqEnd, posEqEnd)) = E.t(); + // fC(_, _(posEqEnd, numVarNew)) = -E.t(); // loading from `E` is expensive // NOTE: if optimizing expression templates, should also // go through and optimize loops like this for (ptrdiff_t j = 0; j < numConstraintsNew; ++j) { for (ptrdiff_t i = 0; i < numEqualityConstraintsOld; 
++i) { - int64_t Eji = E(i, j); - fC(j, i + ineqEnd) = Eji; - fC(j, i + posEqEnd) = -Eji; + int64_t Eji = E[i, j]; + fC[j, i + ineqEnd] = Eji; + fC[j, i + posEqEnd] = -Eji; } } // schedule @@ -595,16 +593,16 @@ class DepPoly : public BasePolyhedra { // ... == w + u'*N + psi // -1 as we flip sign for (ptrdiff_t i = 0; i < numBoundingCoefs; ++i) - fC(i, i + numScheduleCoefs + numLambda) = -1; + fC[i, i + numScheduleCoefs + numLambda] = -1; // so far, both have been identical - NotNull bw = + Valid bw = math::Simplex::create(alloc, numConstraintsNew, numVarNew, 0); auto bCF{bw->getConstraints()}; bCF << fCF; // bCF(_, _(0, numVarNew + 1)) << fCF(_, _(0, numVarNew + 1)); - math::MutPtrMatrix bC{bCF(_, _(1, end))}; + math::MutPtrMatrix bC{bCF[_, _(1, end)]}; // equality constraints get expanded into two inequalities // a == 0 -> @@ -617,14 +615,14 @@ class DepPoly : public BasePolyhedra { // so that the ILP rLexMin on coefficients // will tend to preserve the initial order (which is // better than tending to reverse the initial order). 
- fC(0, numLambda) = 1; - fC(0, 1 + numLambda) = -1; - bC(0, numLambda) = -1; - bC(0, 1 + numLambda) = 1; + fC[0, numLambda] = 1; + fC[0, 1 + numLambda] = -1; + bC[0, numLambda] = -1; + bC[0, 1 + numLambda] = 1; for (ptrdiff_t i = 0; i < numPhiCoefs; ++i) { int64_t s = (2 * (i < numDep0Var) - 1); - fC(i + numBoundingCoefs, i + numLambda + 2) = s; - bC(i + numBoundingCoefs, i + numLambda + 2) = -s; + fC[i + numBoundingCoefs, i + numLambda + 2] = s; + bC[i + numBoundingCoefs, i + numLambda + 2] = -s; } // note that delta/constant coef is handled as last `s` return {fw, bw}; @@ -632,9 +630,9 @@ class DepPoly : public BasePolyhedra { /// returns `true` if the array accesses are guaranteed independent /// conditioning on partial schedules xPhi and yPhi - [[nodiscard]] auto checkSat(Arena<> alloc, NotNull xLoop, + [[nodiscard]] auto checkSat(Arena<> alloc, Valid xLoop, const int64_t *xOff, DensePtrMatrix xPhi, - NotNull yLoop, + Valid yLoop, const int64_t *yOff, DensePtrMatrix yPhi) -> bool { // we take in loops because we might be moving deeper inside the loopnest @@ -642,42 +640,42 @@ class DepPoly : public BasePolyhedra { Row numPhi = xPhi.numRow(); invariant(yPhi.numRow(), numPhi); DensePtrMatrix E{getE()}; - unsigned xNumLoops = unsigned(xPhi.numCol()), - yNumLoops = unsigned(yPhi.numCol()); - if ((numDep0Var == xNumLoops) || allZero(xPhi(_, _(numDep0Var, end)))) + ptrdiff_t xNumLoops = ptrdiff_t(xPhi.numCol()), + yNumLoops = ptrdiff_t(yPhi.numCol()); + if ((numDep0Var == xNumLoops) || allZero(xPhi[_, _(numDep0Var, end)])) xNumLoops = numDep0Var; else invariant(numDep0Var < xNumLoops); - if ((numDep1Var == yNumLoops) || allZero(yPhi(_, _(numDep1Var, end)))) + if ((numDep1Var == yNumLoops) || allZero(yPhi[_, _(numDep1Var, end)])) yNumLoops = numDep1Var; else invariant(numDep1Var < yNumLoops); unsigned numSym = getNumSymbols(), numSymX = numSym + xNumLoops, numSymD0 = numSym + numDep0Var, nCol = numSymX + yNumLoops; MutDensePtrMatrix B{ - matrix(&alloc, numEqCon + 
numPhi, nCol)}; + matrix(&alloc, numEqCon + ptrdiff_t(numPhi), nCol)}; bool extend = (numDep0Var != xNumLoops) || (numDep1Var != yNumLoops); // we truncate time dim if (extend || timeDim) { for (ptrdiff_t r = 0; r < numEqCon; ++r) { - B(r, _(0, numSymD0)) << E(r, _(0, numSymD0)); - B(r, _(numDep0Var, xNumLoops) + numSym) << 0; - B(r, _(0, numDep1Var) + numSymX) << E(r, _(0, numDep1Var) + numSymD0); - B(r, _(numDep1Var, yNumLoops) + numSymX) << 0; + B[r, _(0, numSymD0)] << E[r, _(0, numSymD0)]; + B[r, _(numDep0Var, xNumLoops) + numSym] << 0; + B[r, _(0, numDep1Var) + numSymX] << E[r, _(0, numDep1Var) + numSymD0]; + B[r, _(numDep1Var, yNumLoops) + numSymX] << 0; } } else std::copy_n(E.begin(), E.numRow() * E.numCol(), B.begin()); if (xOff) for (ptrdiff_t c = 0; c < numDep0Var; ++c) if (int64_t mlt = xOff[c]) - B(_(0, numEqCon), 0) -= mlt * B(_(0, numEqCon), numSym + c); + B[_(0, numEqCon), 0] -= mlt * B[_(0, numEqCon), numSym + c]; if (yOff) for (ptrdiff_t c = 0; c < numDep1Var; ++c) if (int64_t mlt = yOff[c]) - B(_(0, numEqCon), 0) -= mlt * B(_(0, numEqCon), numSymX + c); + B[_(0, numEqCon), 0] -= mlt * B[_(0, numEqCon), numSymX + c]; for (ptrdiff_t r = 0; r < numPhi; ++r) { - B(r + numEqCon, _(0, numSym)) << 0; - B(r + numEqCon, _(0, xNumLoops) + numSym) << xPhi(r, _(0, xNumLoops)); - B(r + numEqCon, _(0, yNumLoops) + numSymX) << -yPhi(r, _(0, yNumLoops)); + B[r + numEqCon, _(0, numSym)] << 0; + B[r + numEqCon, _(0, xNumLoops) + numSym] << xPhi[r, _(0, xNumLoops)]; + B[r + numEqCon, _(0, yNumLoops) + numSymX] << -yPhi[r, _(0, yNumLoops)]; } - unsigned rank = unsigned(math::NormalForm::simplifySystemImpl(B)); + unsigned rank = ptrdiff_t(math::NormalForm::simplifySystemImpl(B)); if (rank <= numEqCon) return false; unsigned numConstraints = extend ? 
(xLoop->getNumCon() + xNumLoops + yLoop->getNumCon() + yNumLoops) @@ -699,29 +697,29 @@ class DepPoly : public BasePolyhedra { // numSyms should be the same; we aren't pruning symbols invariant(numSym, 1 + nDS); for (ptrdiff_t r = 0; r < xCon; ++r) { - A(r, _(0, xNumSym)) << Ax(r, _(0, xNumSym)); - A(r, _(xNumSym, numSym)) << 0; - A(r, _(0, xNumLoops) + numSym) << Ax(r, _(0, xNumLoops) + xNumSym); - A(r, _(0, yNumLoops) + numSymX) << 0; + A[r, _(0, xNumSym)] << Ax[r, _(0, xNumSym)]; + A[r, _(xNumSym, numSym)] << 0; + A[r, _(0, xNumLoops) + numSym] << Ax[r, _(0, xNumLoops) + xNumSym]; + A[r, _(0, yNumLoops) + numSymX] << 0; } for (ptrdiff_t r = 0; r < yCon; ++r) { - A(r + xCon, _(0, numSym)) << 0; + A[r + xCon, _(0, numSym)] << 0; for (ptrdiff_t j = 0; j < map.size(); ++j) - A(r + xCon, 1 + map[j]) = Ay(r, 1 + j); - A(r + xCon, _(0, xNumLoops) + numSym) << 0; - A(r + xCon, _(0, yNumLoops) + numSymX) - << Ay(r, _(0, yNumLoops) + yNumSym); + A[r + xCon, 1 + map[j]] = Ay[r, 1 + j]; + A[r + xCon, _(0, xNumLoops) + numSym] << 0; + A[r + xCon, _(0, yNumLoops) + numSymX] + << Ay[r, _(0, yNumLoops) + yNumSym]; } std::fill(A.begin() + size_t(xCon + yCon) * nCol, A.end(), 0); - A(_(0, nLoop) + (xCon + yCon), _(0, nLoop) + numSym).diag() << 1; - } else dp->getA() << getA()(_, _(0, nCol)); // truncate time + A[_(0, nLoop) + (xCon + yCon), _(0, nLoop) + numSym].diag() << 1; + } else dp->getA() << getA()[_, _(0, nCol)]; // truncate time if (xOff) for (ptrdiff_t c = 0; c < xNumLoops; ++c) - if (int64_t mlt = xOff[c]) A(_, 0) -= mlt * A(_, numSym + c); + if (int64_t mlt = xOff[c]) A[_, 0] -= mlt * A[_, numSym + c]; if (yOff) for (ptrdiff_t c = 0; c < yNumLoops; ++c) - if (int64_t mlt = yOff[c]) A(_, 0) -= mlt * A(_, numSymX + c); - dp->getE() << B(_(0, rank), _); + if (int64_t mlt = yOff[c]) A[_, 0] -= mlt * A[_, numSymX + c]; + dp->getE() << B[_(0, rank), _]; dp->pruneBounds(alloc); return dp->getNumCon() == 0; } diff --git a/include/Polyhedra/Loops.hpp 
b/include/Polyhedra/Loops.hpp index 970d90c1f..7c681d9a6 100644 --- a/include/Polyhedra/Loops.hpp +++ b/include/Polyhedra/Loops.hpp @@ -1,20 +1,20 @@ #pragma once +#include "Containers/Pair.hpp" #include "Polyhedra/Comparators.hpp" #include "Polyhedra/Polyhedra.hpp" #include "RemarkAnalysis.hpp" +#include #include #include #include #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -36,7 +36,7 @@ namespace poly::poly { using math::IntMatrix, math::PtrVector, math::PtrMatrix, math::MutPtrMatrix; -using utils::Optional, utils::NotNull, utils::invariant; +using utils::Optional, utils::Valid, utils::invariant; inline auto isKnownOne(llvm::ScalarEvolution &SE, llvm::Value *v) -> bool { return v && SE.getSCEV(v)->isOne(); } @@ -113,26 +113,26 @@ findSymbolicIndex(llvm::ArrayRef symbols, [[nodiscard]] inline auto getMinMaxValueSCEV(llvm::ScalarEvolution &SE, const llvm::SCEVAddRecExpr *S) - -> std::pair { + -> containers::Pair { // if (!SE.containsAddRecurrence(S)) // return S; - if ((!S) || (!(S->isAffine()))) return std::make_pair(S, S); + if ((!S) || (!(S->isAffine()))) return {S, S}; const auto *opStart = S->getStart(); const auto *opStep = S->getStepRecurrence(SE); const auto *opFinal = SE.getSCEVAtScope(S, nullptr); // auto opFinal = SE.getSCEVAtScope(S, S->getLoop()->getParentLoop()); // FIXME: what if there are more AddRecs nested inside? 
- if (SE.isKnownNonNegative(opStep)) return std::make_pair(opStart, opFinal); - if (SE.isKnownNonPositive(opStep)) return std::make_pair(opFinal, opStart); - return std::make_pair(S, S); + if (SE.isKnownNonNegative(opStep)) return {opStart, opFinal}; + if (SE.isKnownNonPositive(opStep)) return {opFinal, opStart}; + return {S, S}; } // TODO: strengthen through recursion [[nodiscard]] inline auto getMinMaxValueSCEV(llvm::ScalarEvolution &SE, const llvm::SCEV *S) - -> std::pair { + -> containers::Pair { if (const auto *T = llvm::dyn_cast(S)) return getMinMaxValueSCEV(SE, T); - return std::make_pair(S, S); + return {S, S}; } [[nodiscard]] inline auto simplifyMinMax(llvm::ScalarEvolution &SE, const llvm::SCEVMinMaxExpr *S) @@ -168,22 +168,22 @@ namespace loopNestCtor { /// we try to break down value `v`, so that adding /// N, N - 1, N - 3 only adds the variable `N`, and adds the constant /// offsets -inline void addSymbol(IntMatrix &A, +inline void addSymbol(IntMatrix> &A, llvm::SmallVectorImpl &symbols, const llvm::SCEV *v, math::Range lu, int64_t mlt) { assert(lu.size()); symbols.push_back(v); - A.resize(A.numCol() + 1); - A(lu, symbols.size()) << mlt; + A.resize(++auto{A.numCol()}); + A[lu, symbols.size()] << mlt; } inline auto addRecMatchesLoop(const llvm::SCEV *S, llvm::Loop *L) -> bool { if (const auto *x = llvm::dyn_cast(S)) return x->getLoop() == L; return false; } -[[nodiscard]] inline auto -addSymbol(std::array &AB, // NOLINT(misc-no-recursion) +[[nodiscard]] inline auto // NOLINTNEXTLINE(misc-no-recursion) +addSymbol(std::array>, 2> &AB, llvm::SmallVectorImpl &symbols, llvm::Loop *L, const llvm::SCEV *v, llvm::ScalarEvolution &SE, math::Range lu, int64_t mlt, ptrdiff_t minDepth) @@ -191,11 +191,11 @@ addSymbol(std::array &AB, // NOLINT(misc-no-recursion) auto &[A, B] = AB; // first, we check if `v` in `Symbols` if (ptrdiff_t i = findSymbolicIndex(symbols, v)) { - A(lu, i) += mlt; + A[lu, i] += mlt; return minDepth; } if (std::optional c = getConstantInt(v)) { 
- A(lu, 0) += mlt * (*c); + A[lu, 0] += mlt * (*c); return minDepth; } if (const auto *ar = llvm::dyn_cast(v)) { @@ -221,8 +221,7 @@ addSymbol(std::array &AB, // NOLINT(misc-no-recursion) minDepth = addSymbol(AB, symbols, L, x->getOperand(0), SE, lu, mlt, minDepth); if (auto opc = getConstantInt(x->getOperand(1))) { - // swap order vs recDepth to go inner<->outer - B(lu, recDepth - 1) << mlt * (*opc); + B[lu, recDepth - 1] << mlt * (*opc); return minDepth; } v = SE.getAddRecExpr(SE.getZero(x->getOperand(0)->getType()), @@ -242,11 +241,11 @@ addSymbol(std::array &AB, // NOLINT(misc-no-recursion) const llvm::SCEV *op1 = mm->getOperand(1); if (isMin ^ (mlt < 0)) { // we can represent this as additional constraints Row M = A.numRow(); - Row Mp = M + std::ssize(lu); + Row Mp = Row<>{ptrdiff_t(M) + std::ssize(lu)}; A.resize(Mp); B.resize(Mp); - A(_(M, Mp), _) = A(lu, _); - B(_(M, Mp), _) = B(lu, _); + A[_(M, Mp), _] = A[lu, _]; + B[_(M, Mp), _] = B[lu, _]; minDepth = addSymbol(AB, symbols, L, op0, SE, lu, mlt, minDepth); minDepth = addSymbol(AB, symbols, L, op1, SE, _(M, Mp), mlt, minDepth); } else if (addRecMatchesLoop(op0, L)) { @@ -260,29 +259,29 @@ addSymbol(std::array &AB, // NOLINT(misc-no-recursion) return minDepth; } inline auto -areSymbolsLoopInvariant(IntMatrix &A, +areSymbolsLoopInvariant(IntMatrix> &A, llvm::SmallVectorImpl &symbols, llvm::Loop *L, llvm::ScalarEvolution &SE) -> bool { for (ptrdiff_t i = 0; i < std::ssize(symbols); ++i) - if ((!allZero(A(_, i + 1))) && (!SE.isLoopInvariant(symbols[i], L))) + if ((!allZero(A[_, i + 1])) && (!SE.isLoopInvariant(symbols[i], L))) return false; return true; } inline auto // NOLINTNEXTLINE(misc-no-recursion) -addBackedgeTakenCount(std::array &AB, +addBackedgeTakenCount(std::array>, 2> &AB, llvm::SmallVectorImpl &symbols, llvm::Loop *L, const llvm::SCEV *BT, llvm::ScalarEvolution &SE, ptrdiff_t minDepth, llvm::OptimizationRemarkEmitter *ORE) -> ptrdiff_t { // A contains syms auto &[A, B] = AB; - Row M = A.numRow(); - 
A.resize(M + 1); - B.resize(M + 1); - minDepth = addSymbol(AB, symbols, L, BT, SE, _(M, M + 1), 1, minDepth); + Row M = A.numRow(), MM = M; + A.resize(++MM); + B.resize(MM); + minDepth = addSymbol(AB, symbols, L, BT, SE, _(M, MM), 1, minDepth); assert(A.numRow() == B.numRow()); ptrdiff_t depth = L->getLoopDepth() - 1; - for (auto m = ptrdiff_t(M); m < A.numRow(); ++m) B(m, depth) = -1; // indvar + for (auto m = ptrdiff_t(M); m < A.numRow(); ++m) B[m, depth] = -1; // indvar // recurse, if possible to add an outer layer if (llvm::Loop *P = L->getParentLoop()) { if (areSymbolsLoopInvariant(A, symbols, P, SE)) { @@ -360,19 +359,19 @@ class Loop : public BasePolyhedra { static inline auto construct(Arena<> *alloc, llvm::Loop *L, const llvm::SCEV *BT, llvm::ScalarEvolution &SE, llvm::OptimizationRemarkEmitter *ORE = nullptr) - -> NotNull { + -> Valid { // A holds symbols // B holds loop bounds // they're separate so we can grow them independently - std::array AB; + std::array>, 2> AB; auto &[A, B] = AB; // once we're done assembling these, we'll concatenate A and B unsigned maxDepth = L->getLoopDepth(); invariant(maxDepth > 0); // ptrdiff_t maxNumSymbols = BT->getExpressionSize(); A.resizeForOverwrite( - math::StridedDims{0, 1, unsigned(1) + BT->getExpressionSize()}); - B.resizeForOverwrite(math::StridedDims{0, maxDepth, maxDepth}); + math::StridedDims<>{{0}, {1}, {ptrdiff_t(1) + BT->getExpressionSize()}}); + B.resizeForOverwrite(math::StridedDims<>{{0}, {maxDepth}, {maxDepth}}); llvm::SmallVector symbols; ptrdiff_t minDepth = loopNestCtor::addBackedgeTakenCount(AB, symbols, L, BT, SE, 0, ORE); @@ -387,7 +386,7 @@ class Loop : public BasePolyhedra { // search B(_,d) for references for (ptrdiff_t i = 0; i < B.numRow(); ++i) { // TODO; confirm `last` vs `end` - if (int64_t Bid = B(i, d)) { + if (int64_t Bid = B[i, d]) { if (!P) { // find P P = L; for (ptrdiff_t r = d + 1; r < maxDepth; ++r) P = P->getParentLoop(); @@ -404,20 +403,45 @@ class Loop : public BasePolyhedra { } 
invariant(1 + std::ssize(symbols), ptrdiff_t(A.numCol())); ptrdiff_t depth = maxDepth - minDepth; - unsigned numConstraints = unsigned(A.numRow()), N = unsigned(A.numCol()); - NotNull aln{ + ptrdiff_t numConstraints = ptrdiff_t(A.numRow()), N = ptrdiff_t(A.numCol()); + Valid aln{ Loop::allocate(alloc, L, numConstraints, depth, symbols, maxDepth)}; - aln->getA()(_, _(0, N)) << A; + aln->getA()[_, _(0, N)] << A; // copy the included loops from B // we use outer <-> inner order, so we skip unsupported outer loops. - aln->getA()(_, _(N, N + depth)) << B(_, _(end - depth, end)); + aln->getA()[_, _(N, N + depth)] << B[_, _(end - depth, end)]; return aln; // addZeroLowerBounds(); // NOTE: pruneBounds() is not legal here if we wish to use // removeInnerMost later. // pruneBounds(); } - + /// Gives a very rough trip count estimate (second return value) + /// with a boolean fist arg indicating whether it is exact or estimated. + /// The estimation approach here can be seriously improved. + /// Currently, if not exact, it simply returns 128. + [[nodiscard]] auto tripCount(ptrdiff_t depth) const + -> std::array { + auto A{getA()}; + // `i` is position of depth's indvar + ptrdiff_t i = 1 + numDynSymbols + depth, j = -1, k = -1; + // `A * loopindvars >= 0` + // Aci >= 0 is a lower bound + // Aci <= 0 is an upper bound + for (ptrdiff_t c = 0; c < A.numRow(); ++c) { + int64_t Aci = A[c, i]; + if (Aci > 0) { + if ((j >= 0) || (!math::allZero(A[c, _(1, i)]))) return {0, 128}; + j = c; + } else if (Aci < 0) { + if ((k >= 0) || (!math::allZero(A[c, _(1, i)]))) return {0, 128}; + k = c; + } + } + invariant(j >= 0); // must have lower bound + invariant(k >= 0); // must have upper bound + return {1, std::min(0xffff, A[k, 0] - A[j, 0])}; + } auto findIndex(const llvm::SCEV *v) const -> ptrdiff_t { return findSymbolicIndex(getSyms(), v); } @@ -431,9 +455,8 @@ class Loop : public BasePolyhedra { /// offset the loops by `offsets`, e.g. 
if we have /// offsets[0] = 2, then the first loop is shifted by 2. /// this shifting is applied before rotation. - [[nodiscard]] constexpr auto rotate(Arena<> *alloc, DensePtrMatrix R, - const int64_t *offsets) const - -> NotNull { + [[nodiscard]] auto rotate(Arena<> *alloc, DensePtrMatrix R, + const int64_t *offsets) const -> Valid { // if offsets is not null, we have the equivalent of // A * O * [I 0; 0 R] // where O = I - [0 0; offsets 0], @@ -442,21 +465,21 @@ class Loop : public BasePolyhedra { bool thisNonNeg = isNonNegative(), nonNeg = thisNonNeg && allGEZero(R), addExtra = thisNonNeg != nonNeg; if (addExtra) numExtraVar = getNumLoops(); - invariant(unsigned(R.numCol()), getNumLoops()); - invariant(unsigned(R.numRow()), getNumLoops()); + invariant(ptrdiff_t(R.numCol()), getNumLoops()); + invariant(ptrdiff_t(R.numRow()), getNumLoops()); auto A{getA()}; - const auto [M, N] = A.size(); + const auto [M, N] = shape(A); auto syms{getSyms()}; - NotNull aln{Loop::allocate(alloc, L, ptrdiff_t(M) + numExtraVar, - numLoops, syms, nonNeg)}; + Valid aln{Loop::allocate(alloc, L, ptrdiff_t(M) + numExtraVar, + numLoops, syms, nonNeg)}; auto B{aln->getA()}; - invariant(B.numRow(), M + numExtraVar); - invariant(B.numCol(), N); - B(_(0, M), _(0, numConst)) << A(_, _(0, numConst)); - B(_(0, M), _(numConst, end)) << A(_, _(numConst, end)) * R; + invariant(B.numRow() == M + numExtraVar); + invariant(B.numCol() == N); + B[_(0, M), _(0, numConst)] << A[_, _(0, numConst)]; + B[_(0, M), _(numConst, end)] << A[_, _(numConst, end)] * R; if (addExtra) { - B(_(M, end), _(0, numConst)) << 0; - B(_(M, end), _(numConst, end)) << R; + B[_(M, end), _(0, numConst)] << 0; + B[_(M, end), _(numConst, end)] << R; } // A * O * [I 0; 0 R] = A * [I 0; 0 R] - A * [0 0; offs 0] * [I 0; 0 R] // above, we computed `A * [I 0; 0 R]`, now if offsets != nullptr, @@ -470,8 +493,8 @@ class Loop : public BasePolyhedra { if (offsets) { for (ptrdiff_t l = 0, D = getNumLoops(); l < D; ++l) { if (int64_t mlt = 
offsets[l]) { - B(_(0, M), 0) -= mlt * A(_, numConst + l); - if (addExtra) B(M + l, 0) = -mlt; + B[_(0, M), 0] -= mlt * A[_, numConst + l]; + if (addExtra) B[M + l, 0] = -mlt; } } } @@ -479,30 +502,30 @@ class Loop : public BasePolyhedra { return aln; } [[nodiscard]] constexpr auto rotate(Arena<> *alloc, DensePtrMatrix R, - const int64_t *offsets) -> NotNull { + const int64_t *offsets) -> Valid { if (R == math::I) return this; return ((const Loop *)this)->rotate(alloc, R, offsets); } - [[nodiscard]] auto removeInnerMost(Arena<> *alloc) const -> NotNull { + [[nodiscard]] auto removeInnerMost(Arena<> *alloc) const -> Valid { // order is outer<->inner auto A{getA()}; - auto ret = Loop::allocate(alloc, L->getParentLoop(), unsigned(A.numRow()), + auto ret = Loop::allocate(alloc, L->getParentLoop(), ptrdiff_t(A.numRow()), getNumLoops() - 1, getSyms(), isNonNegative()); MutPtrMatrix B{ret->getA()}; - B << A(_, _(0, last)); + B << A[_, _(0, last)]; // no loop may be conditioned on the innermost loop, so we should be able to // safely remove all constraints that reference it for (Row m = B.numRow(); m--;) { - if (A(m, last)) { - if (m != B.numRow() - 1) B(m, _) << B(last, _); - B.truncate(B.numRow() - 1); + if (A[m, last]) { + if (m != --auto{B.numRow()}) B[m, _] << B[last, _]; + B.truncate(--B.numRow()); } } - ret->truncateConstraints(unsigned(B.numRow())); + ret->truncateConstraints(ptrdiff_t(B.numRow())); return ret; } - constexpr void truncateConstraints(unsigned newNumConstraints) { + constexpr void truncateConstraints(ptrdiff_t newNumConstraints) { assert(newNumConstraints <= numConstraints); numConstraints = newNumConstraints; } @@ -549,34 +572,33 @@ class Loop : public BasePolyhedra { ptrdiff_t M = numConstraints; numConstraints += numLoops; auto A{getA()}; - A(_(M, end), _) << 0; - for (ptrdiff_t i = 0; i < numLoops; ++i) A(M + i, end - numLoops + i) = 1; + A[_(M, end), _] << 0; + for (ptrdiff_t i = 0; i < numLoops; ++i) A[M + i, end - numLoops + i] = 1; // 
this->pruneBounds(alloc); } [[nodiscard]] constexpr auto getProgVars(ptrdiff_t j) const -> PtrVector { - return getA()(j, _(0, getNumSymbols())); + return getA()[j, _(0, getNumSymbols())]; } - [[nodiscard]] constexpr auto copy(Arena<> *alloc) const -> NotNull { + [[nodiscard]] auto copy(Arena<> *alloc) const -> Valid { auto ret = Loop::allocate(alloc, L, numConstraints, numLoops, getSyms(), isNonNegative()); ret->getA() << getA(); return ret; } - [[nodiscard]] constexpr auto removeLoop(Arena<> *alloc, ptrdiff_t v) const - -> Loop * { + [[nodiscard]] auto removeLoop(Arena<> *alloc, ptrdiff_t v) const -> Loop * { auto A{getA()}; v += getNumSymbols(); - auto zeroNegPos = indsZeroNegPos(A(_, v)); + auto zeroNegPos = indsZeroNegPos(A[_, v]); auto &[zer, neg, pos] = zeroNegPos; - unsigned numCon = - unsigned(A.numRow()) - pos.size() + neg.size() * pos.size(); + ptrdiff_t numCon = + ptrdiff_t(A.numRow()) - pos.size() + neg.size() * pos.size(); if (!isNonNegative()) numCon -= neg.size(); auto p = checkpoint(alloc); auto ret = Loop::allocate(alloc, nullptr, numCon, numLoops - 1, getSyms(), isNonNegative()); - ret->numConstraints = unsigned( + ret->numConstraints = ptrdiff_t( isNonNegative() ? 
fourierMotzkinCore(ret->getA(), getA(), v, zeroNegPos) : fourierMotzkinCore(ret->getA(), getA(), v, zeroNegPos)); @@ -590,7 +612,7 @@ class Loop : public BasePolyhedra { return ret; } constexpr void eraseConstraint(ptrdiff_t c) { - eraseConstraintImpl(getA(), c); + eraseConstraintImpl(getA(), Row<>{c}); --numConstraints; } [[nodiscard]] auto zeroExtraItersUponExtending(Arena<> alloc, ptrdiff_t _i, @@ -610,23 +632,23 @@ class Loop : public BasePolyhedra { const ptrdiff_t numConst = getNumSymbols(); auto A{tmp->getA()}; for (ptrdiff_t n = 0; n < A.numRow(); ++n) - if ((A(n, numConst) != 0) && (A(n, 1 + numConst) != 0)) indep = false; + if ((A[n, numConst] != 0) && (A[n, 1 + numConst] != 0)) indep = false; if (indep) return false; Loop *margi = tmp->removeLoop(&alloc, 1), *tmp2; - invariant(margi->getNumLoops(), unsigned(1)); - invariant(tmp->getNumLoops(), unsigned(2)); - invariant(margi->getA().numCol() + 1, tmp->getA().numCol()); + invariant(margi->getNumLoops(), ptrdiff_t(1)); + invariant(tmp->getNumLoops(), ptrdiff_t(2)); + invariant(++auto{margi->getA().numCol()}, tmp->getA().numCol()); // margi contains extrema for `_i` // we can substitute extended for value of `_i` // in `tmp` auto p2 = alloc.checkpoint(); int64_t sign = 2 * extendLower - 1; // extendLower ? 
1 : -1 for (ptrdiff_t c = 0; c < margi->getNumInequalityConstraints(); ++c) { - int64_t b = sign * margi->getA()(c, numConst); + int64_t b = sign * margi->getA()[c, numConst]; if (b <= 0) continue; alloc.rollback(p2); tmp2 = tmp->copy(&alloc); - invariant(tmp2->getNumLoops(), unsigned(2)); + invariant(tmp2->getNumLoops(), ptrdiff_t(2)); invariant(margi->getNumLoops() + 1, tmp2->getNumLoops()); // increment to increase bound // this is correct for both extending lower and extending upper @@ -634,18 +656,18 @@ class Loop : public BasePolyhedra { // upper: a'x - i + b >= 0 -> i <= a'x + b // to decrease the lower bound or increase the upper, we increment // `b` - ++(margi->getA())(c, 0); + ++(margi->getA())[c, 0]; // our approach here is to set `_i` equal to the extended bound // and then check if the resulting polyhedra is empty. // if not, then we may have >0 iterations. for (ptrdiff_t cc = 0; cc < tmp2->getNumCon(); ++cc) { - if (int64_t d = tmp2->getA()(cc, numConst)) { - tmp2->getA()(cc, _(0, last)) << b * tmp2->getA()(cc, _(0, last)) - - (d * sign) * margi->getA()(c, _); + if (int64_t d = tmp2->getA()[cc, numConst]) { + tmp2->getA()[cc, _(0, last)] << b * tmp2->getA()[cc, _(0, last)] - + (d * sign) * margi->getA()[c, _]; } } for (auto cc = ptrdiff_t(tmp2->getNumCon()); cc;) - if (tmp2->getA()(--cc, 1 + numConst) == 0) tmp2->eraseConstraint(cc); + if (tmp2->getA()[--cc, 1 + numConst] == 0) tmp2->eraseConstraint(cc); if (!(tmp2->calcIsEmpty(alloc))) return false; } if (isNonNegative()) { @@ -659,16 +681,16 @@ class Loop : public BasePolyhedra { // extended bound and then check if the resulting polyhedra is // empty. if not, then we may have >0 iterations. 
for (ptrdiff_t cc = 0; cc < tmp->getNumCon(); ++cc) { - if (int64_t d = tmp->getA()(cc, numConst)) { + if (int64_t d = tmp->getA()[cc, numConst]) { // lower bound is i >= 0 // so setting equal to the extended lower bound now // means that i = -1 so we decrement `d` from the column - tmp->getA()(cc, 0) -= d; - tmp->getA()(cc, numConst) = 0; + tmp->getA()[cc, 0] -= d; + tmp->getA()[cc, numConst] = 0; } } for (auto cc = ptrdiff_t(tmp->getNumCon()); cc;) - if (tmp->getA()(--cc, 1 + numConst) == 0) tmp->eraseConstraint(cc); + if (tmp->getA()[--cc, 1 + numConst] == 0) tmp->eraseConstraint(cc); if (!(tmp->calcIsEmpty(alloc))) return false; } } @@ -706,7 +728,7 @@ class Loop : public BasePolyhedra { DensePtrMatrix A{getA()}; bool printed = printSymbol(os, b, -sign); for (ptrdiff_t k = 0; k < numVarMinus1; ++k) { - if (int64_t lakj = A(j, k + numConst)) { + if (int64_t lakj = A[j, k + numConst]) { if (lakj * sign > 0) os << " - "; else if (printed) os << " + "; lakj = math::constexpr_abs(lakj); @@ -730,7 +752,7 @@ class Loop : public BasePolyhedra { if (numRow > 1) os << (isUpper ? "min(" : "max("); DensePtrMatrix A{getA()}; for (ptrdiff_t j = 0, k = 0; j < A.numRow(); ++j) { - if (A(j, last) * sign <= 0) continue; + if (A[j, last] * sign <= 0) continue; if (k++) os << ", "; printBound(os, sign, numVarMinus1, numConst, j); } @@ -753,7 +775,7 @@ class Loop : public BasePolyhedra { ptrdiff_t numRow = 0; int64_t allAj = 0; for (ptrdiff_t j = 0; j < A.numRow(); ++j) { - int64_t Ajr = A(j, last), Aj = Ajr * sign; + int64_t Ajr = A[j, last], Aj = Ajr * sign; if (Aj <= 0) continue; if (allAj) allAj = allAj == Aj ? 
allAj : -1; else allAj = Aj; @@ -769,7 +791,7 @@ class Loop : public BasePolyhedra { if (allAj > 0) return printBoundShort(os, sign, numVarM1, numConst, allAj, numRow, true); for (ptrdiff_t j = 0; j < A.numRow(); ++j) { - int64_t Ajr = A(j, end - 1), Aj = Ajr * sign; + int64_t Ajr = A[j, end - 1], Aj = Ajr * sign; if (Aj <= 0) continue; if (hasPrintedLine) for (ptrdiff_t k = 0; k < 21; ++k) os << ' '; @@ -790,7 +812,7 @@ class Loop : public BasePolyhedra { int64_t allAj = 0; ptrdiff_t numPos = 0, numNeg = 0; for (ptrdiff_t j = 0; j < A.numRow(); ++j) { - int64_t Ajr = A(j, last); + int64_t Ajr = A[j, last]; if (Ajr == 0) continue; numPos += Ajr > 0; numNeg += Ajr < 0; @@ -823,45 +845,52 @@ class Loop : public BasePolyhedra { // We pop off the outer most loop on every iteration. friend inline auto operator<<(llvm::raw_ostream &os, const Loop &aln) -> llvm::raw_ostream & { - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; aln.dump(os, &alloc); return os; } #ifndef NDEBUG [[gnu::used]] void dump() const { llvm::errs() << *this; } #endif - [[nodiscard]] constexpr auto getNumCon() const -> unsigned { + [[nodiscard]] constexpr auto getNumCon() const -> ptrdiff_t { return numConstraints; } [[nodiscard]] constexpr auto getA() -> MutDensePtrMatrix { const void *ptr = memory + sizeof(const llvm::SCEV *const *) * numDynSymbols; auto *p = (int64_t *)const_cast(ptr); - return {p, math::DenseDims{numConstraints, numLoops + numDynSymbols + 1}}; + return { + p, math::DenseDims<>{{numConstraints}, {numLoops + numDynSymbols + 1}}}; }; + /// returns the `A` where `A * i >= 0`, `i` are loop indvars + /// Number of rows indicate number of constraints, columns are + /// /// returns the `A` where `A * i >= 0`, `i` are loop indvars + /// Number of rows indicate number of constraints, columns are + /// 1 (constant) + numDynSymbols + number of loops [[nodiscard]] constexpr auto getA() const -> DensePtrMatrix { const void *ptr = memory + sizeof(const llvm::SCEV *const *) * 
numDynSymbols; auto *p = (int64_t *)const_cast(ptr); - return {p, math::DenseDims{numConstraints, numLoops + numDynSymbols + 1}}; + return { + p, math::DenseDims<>{{numConstraints}, {numLoops + numDynSymbols + 1}}}; }; - [[nodiscard]] constexpr auto getOuterA(unsigned subLoop) + [[nodiscard]] constexpr auto getOuterA(ptrdiff_t subLoop) -> MutPtrMatrix { const void *ptr = memory + sizeof(const llvm::SCEV *const *) * numDynSymbols; auto *p = (int64_t *)const_cast(ptr); - unsigned numSym = numDynSymbols + 1; - return {p, math::StridedDims{numConstraints, subLoop + numSym, - numLoops + numSym}}; + ptrdiff_t numSym = numDynSymbols + 1; + return {p, math::StridedDims<>{ + {numConstraints}, {subLoop + numSym}, {numLoops + numSym}}}; }; - [[nodiscard]] constexpr auto getOuterA(unsigned subLoop) const + [[nodiscard]] constexpr auto getOuterA(ptrdiff_t subLoop) const -> PtrMatrix { const void *ptr = memory + sizeof(const llvm::SCEV *const *) * numDynSymbols; auto *p = (int64_t *)const_cast(ptr); - unsigned numSym = numDynSymbols + 1; - return {p, math::StridedDims{numConstraints, subLoop + numSym, - numLoops + numSym}}; + ptrdiff_t numSym = numDynSymbols + 1; + return {p, math::StridedDims<>{ + {numConstraints}, {subLoop + numSym}, {numLoops + numSym}}}; }; [[nodiscard]] auto getSyms() -> llvm::MutableArrayRef { void *ptr = memory; @@ -871,31 +900,31 @@ class Loop : public BasePolyhedra { const void *ptr = memory; return {(const llvm::SCEV *const *)ptr, numDynSymbols}; } - [[nodiscard]] constexpr auto getNumLoops() const -> unsigned { + [[nodiscard]] constexpr auto getNumLoops() const -> ptrdiff_t { return numLoops; } - [[nodiscard]] constexpr auto getNumSymbols() const -> unsigned { + [[nodiscard]] constexpr auto getNumSymbols() const -> ptrdiff_t { return numDynSymbols + 1; } - constexpr void truncNumInEqCon(Row r) { + constexpr void truncNumInEqCon(Row<> r) { invariant(r < numConstraints); - numConstraints = unsigned(r); + numConstraints = ptrdiff_t(r); } [[nodiscard]] 
static auto construct(Arena<> *alloc, llvm::Loop *L, PtrMatrix A, llvm::ArrayRef syms, bool nonNeg) -> Loop * { - unsigned numLoops = unsigned(A.numCol()) - 1 - syms.size(); + ptrdiff_t numLoops = ptrdiff_t(A.numCol()) - 1 - syms.size(); Loop *aln = - allocate(alloc, L, unsigned(A.numRow()), numLoops, syms, nonNeg); + allocate(alloc, L, ptrdiff_t(A.numRow()), numLoops, syms, nonNeg); aln->getA() << A; return aln; } [[nodiscard]] static auto allocate(Arena<> *alloc, llvm::Loop *L, unsigned numCon, unsigned numLoops, llvm::ArrayRef syms, - bool nonNegative) -> NotNull { + bool nonNegative) -> Valid { unsigned numDynSym = syms.size(); unsigned N = numLoops + numDynSym + 1; // extra capacity for adding 0 lower bounds later, see @@ -908,7 +937,7 @@ class Loop : public BasePolyhedra { auto *mem = (Loop *)alloc->allocate(sizeof(Loop) + memNeeded); auto *aln = std::construct_at(mem, L, numCon, numLoops, numDynSym, M); std::copy_n(syms.begin(), numDynSym, aln->getSyms().begin()); - return NotNull{aln}; + return Valid{aln}; } explicit constexpr Loop(llvm::Loop *loop, unsigned _numConstraints, unsigned _numLoops, unsigned _numDynSymbols, diff --git a/include/Polyhedra/Polyhedra.hpp b/include/Polyhedra/Polyhedra.hpp index c187671e3..6c440c838 100644 --- a/include/Polyhedra/Polyhedra.hpp +++ b/include/Polyhedra/Polyhedra.hpp @@ -1,26 +1,24 @@ #pragma once #include "Polyhedra/Comparators.hpp" +#include #include #include +#include #include #include #include -#include -#include #include #include -#include #include -#include #ifndef NDEBUG #include #endif namespace poly::poly { +using alloc::Arena; using math::DensePtrMatrix, math::MutDensePtrMatrix, math::EmptyMatrix, math::Row, math::Col, math::vector, math::matrix, math::_, math::end, - math::last; -using utils::Arena; + math::last, math::operator<<; inline auto printPositive(std::ostream &os, ptrdiff_t stop) -> std::ostream & { for (ptrdiff_t i = 0; i < stop; ++i) os << "v_" << i << " >= 0\n"; return os; @@ -82,14 +80,14 @@ 
struct BasePolyhedra { if constexpr (HasEqualities) return static_cast(this)->getE(); else return EmptyMatrix(); } - constexpr void truncNumInEqCon(Row r) { + constexpr void truncNumInEqCon(Row<> r) { static_cast

(this)->truncNumInEqCon(r); } - constexpr void truncNumEqCon(Row r) { + constexpr void truncNumEqCon(Row<> r) { if constexpr (HasEqualities) static_cast

(this)->truncNumEqCon(r); } [[nodiscard]] constexpr auto - initializeComparator(std::allocator alloc = + initializeComparator(alloc::Mallocator alloc = {}) // NOLINT(performance-unnecessary-value-param) -> comparator::LinearSymbolicComparator { if constexpr (MaybeNonNeg) @@ -133,24 +131,24 @@ struct BasePolyhedra { pruneBoundsCore(&alloc); } constexpr void pruneBounds() { - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; pruneBounds(alloc); } constexpr void eraseConstraint(ptrdiff_t constraint) { - eraseConstraintImpl(getA(), constraint); + eraseConstraintImpl(getA(), Row<>{constraint}); decrementNumConstraints(); } template constexpr void pruneBoundsCore(Arena<> *alloc) { - auto diff = vector(alloc, unsigned(getA().numCol())); + auto diff = vector(alloc, ptrdiff_t(getA().numCol())); auto p = checkpoint(alloc); const ptrdiff_t dyn = getNumDynamic(); if constexpr (HasEqualities) { auto [ar, er] = removeRedundantRows(getA(), getE()); - setNumConstraints(unsigned(ar)); - setNumEqConstraints(unsigned(er)); + setNumConstraints(ptrdiff_t(ar)); + setNumEqConstraints(ptrdiff_t(er)); for (ptrdiff_t i = 0; i < getNumEqualityConstraints(); ++i) { - auto l = gcd(getE()(i, _)); - if (l != 1) getE()(i, _) /= l; + auto l = gcd(getE()[i, _]); + if (l != 1) getE()[i, _] /= l; } } auto C = initializeComparator(alloc); @@ -165,7 +163,7 @@ struct BasePolyhedra { bool broke = false; for (auto i = --j; i;) { if (getNumCon() <= 1) return; - diff << getA()(--i, _) - getA()(j, _); + diff << getA()[--i, _] - getA()[j, _]; if (C.greaterEqual(*alloc, diff)) { eraseConstraint(i); rollback(alloc, p); @@ -182,7 +180,7 @@ struct BasePolyhedra { if constexpr (MaybeNonNeg) { if (isNonNegative() && !broke) { for (ptrdiff_t i = 0; i < dyn; ++i) { - diff << getA()(j, _); + diff << getA()[j, _]; --diff[last - i]; if (C.greaterEqual(*alloc, diff)) { eraseConstraint(j); @@ -229,13 +227,13 @@ struct BasePolyhedra { dropEmptyConstraints(getA()); if constexpr (HasEqualities) 
dropEmptyConstraints(getE()); } - friend inline auto operator<<(llvm::raw_ostream &os, const BasePolyhedra &p) - -> llvm::raw_ostream & { - auto &&os2 = printConstraints(os << "\n", p.getA()); + friend inline auto operator<<(std::ostream &os, const BasePolyhedra &p) + -> std::ostream & { + printConstraints(os << "\n", p.getA()); if constexpr (MaybeNonNeg) - if (p.isNonNegative()) printPositive(os2, p.getNumDynamic()); - if constexpr (HasEqualities) return printConstraints(os2, p.getE(), false); - return os2; + if (p.isNonNegative()) printPositive(os, p.getNumDynamic()); + if constexpr (HasEqualities) return printConstraints(os, p.getE(), false); + return os; } #ifndef NDEBUG [[gnu::used]] void dump() const { @@ -252,8 +250,8 @@ struct BasePolyhedra { // return false; } void truncateVars(ptrdiff_t numVar) { - if constexpr (HasEqualities) getE().truncate(Col{numVar}); - getA().truncate(Col{numVar}); + if constexpr (HasEqualities) getE().truncate(Col<>{numVar}); + getA().truncate(Col<>{numVar}); } }; } // namespace poly::poly diff --git a/include/Polyhedra/Schedule.hpp b/include/Polyhedra/Schedule.hpp index bf50d6ba1..ee2949977 100644 --- a/include/Polyhedra/Schedule.hpp +++ b/include/Polyhedra/Schedule.hpp @@ -1,16 +1,14 @@ #pragma once +#include "Alloc/Arena.hpp" #include "Math/Array.hpp" -#include "Utilities/Allocators.hpp" #include #include #include -#include #include #include #include #include -#include namespace poly::poly { using math::_, math::PtrVector, math::MutPtrVector, math::SquarePtrMatrix, @@ -46,11 +44,11 @@ struct AffineSchedule { constexpr AffineSchedule() : mem(nullptr) {} constexpr AffineSchedule(int64_t *m) : mem(m) {} - constexpr AffineSchedule(utils::Arena<> *alloc, unsigned nL) + constexpr AffineSchedule(alloc::Arena<> *alloc, unsigned nL) : mem(alloc->allocate(requiredScheduleStorage(nL))) { mem[0] = nL; } - constexpr auto copy(utils::Arena<> *alloc) const -> AffineSchedule { + constexpr auto copy(alloc::Arena<> *alloc) const -> AffineSchedule 
{ size_t reqMem = requiredScheduleStorage(getNumLoops()); AffineSchedule res{alloc->allocate(reqMem)}; std::copy_n(mem, reqMem, res.mem); @@ -73,18 +71,18 @@ struct AffineSchedule { } // NOLINTNEXTLINE(readability-make-member-function-const) [[nodiscard]] constexpr auto getPhi() -> MutSquarePtrMatrix { - return {data(), math::SquareDims{unsigned(getNumLoops())}}; + return {data(), math::SquareDims<>{getNumLoops()}}; } [[nodiscard]] constexpr auto getPhi() const -> SquarePtrMatrix { - return {data(), math::SquareDims{getNumLoops()}}; // + return {data(), math::SquareDims<>{getNumLoops()}}; // } /// getSchedule, loops are always indexed from outer to inner [[nodiscard]] constexpr auto getSchedule(size_t d) const -> math::PtrVector { - return getPhi()(d, _); + return getPhi()[d, _]; } [[nodiscard]] constexpr auto getSchedule(size_t d) -> MutPtrVector { - return getPhi()(d, _); + return getPhi()[d, _]; } [[nodiscard]] constexpr auto getFusionOmega(size_t i) const -> int64_t { return data()[getNumLoopsSquared() + i]; diff --git a/include/Support/Iterators.hpp b/include/Support/Iterators.hpp index f4d9b4878..9d1063a52 100644 --- a/include/Support/Iterators.hpp +++ b/include/Support/Iterators.hpp @@ -1,8 +1,6 @@ #pragma once #include #include -#include -#include #include #include @@ -18,8 +16,8 @@ class VCycleIterator { public: using value_type = int32_t; constexpr VCycleIterator() noexcept = default; - constexpr VCycleIterator(const int32_t *data, int32_t start) noexcept - : data(data), state(start), start(start), dobreak(start < 0) {} + constexpr VCycleIterator(const int32_t *data_, int32_t start_) noexcept + : data(data_), state(start_), start(start_), dobreak(start_ < 0) {} constexpr auto operator*() const noexcept -> int32_t { return state; } constexpr auto operator++() noexcept -> VCycleIterator & { state = data[state]; @@ -64,10 +62,10 @@ class VCycleRange : public std::ranges::view_interface { int32_t start; public: - constexpr VCycleRange(math::PtrVector data, 
int32_t start) noexcept - : data(data.begin()), start(start) {} - constexpr VCycleRange(const int32_t *data, int32_t start) noexcept - : data(data), start(start) {} + constexpr VCycleRange(math::PtrVector data_, int32_t start_) noexcept + : data(data_.begin()), start(start_) {} + constexpr VCycleRange(const int32_t *data_, int32_t start_) noexcept + : data(data_), start(start_) {} [[nodiscard]] constexpr auto begin() const noexcept -> VCycleIterator { return {data, start}; @@ -76,19 +74,24 @@ class VCycleRange : public std::ranges::view_interface { }; static_assert(std::ranges::forward_range); +/// VForwardIterator is safe with respect to removing the current iteration from +/// the list. However, behavior is undefined if you remove or move the next +/// element. class VForwardIterator { const int32_t *data{nullptr}; int32_t state{-1}; + int32_t next{-1}; public: using value_type = int32_t; constexpr VForwardIterator() noexcept = default; - constexpr VForwardIterator(const int32_t *data, int32_t start) noexcept - : data(data), state(start) {} + constexpr VForwardIterator(const int32_t *data_, int32_t start_) noexcept + : data{data_}, state{start_}, next{start_ < 0 ? 
start_ : data_[start_]} {} constexpr auto operator*() const noexcept -> int32_t { return state; } constexpr auto operator++() noexcept -> VForwardIterator & { - state = data[state]; + state = next; + if (next >= 0) next = data[next]; return *this; } constexpr auto operator++(int) noexcept -> VForwardIterator { @@ -129,10 +132,11 @@ class VForwardRange : public std::ranges::view_interface { int32_t start; public: - constexpr VForwardRange(math::PtrVector data, int32_t start) noexcept - : data(data.begin()), start(start) {} - constexpr VForwardRange(const int32_t *data, int32_t start) noexcept - : data(data), start(start) {} + constexpr VForwardRange(math::PtrVector data_, + int32_t start_) noexcept + : data(data_.begin()), start(start_) {} + constexpr VForwardRange(const int32_t *data_, int32_t start_) noexcept + : data(data_), start(start_) {} [[nodiscard]] constexpr auto begin() const noexcept -> VForwardIterator { return {data, start}; diff --git a/include/Support/OStream.hpp b/include/Support/OStream.hpp index 1671b0830..e004194a3 100644 --- a/include/Support/OStream.hpp +++ b/include/Support/OStream.hpp @@ -13,7 +13,8 @@ template inline auto operator<<(llvm::raw_ostream &os, PtrVector const &A) -> llvm::raw_ostream & { std::ostringstream sos; - return os << printVector(sos, A).str(); + printVector(sos, A); + return os << sos.str(); } inline auto operator<<(llvm::raw_ostream &os, const AbstractVector auto &A) -> llvm::raw_ostream & { @@ -25,17 +26,18 @@ template inline auto operator<<(llvm::raw_ostream &os, PtrMatrix A) -> llvm::raw_ostream & { std::ostringstream sos; - return os << printMatrix(sos, A).str(); + printMatrix(sos, A); + return os << sos.str(); } template -inline auto operator<<(llvm::raw_ostream &os, Array A) - -> std::ostream & { - return printMatrix(os, PtrMatrix{A}); +inline auto operator<<(llvm::raw_ostream &os, Array> A) + -> llvm::raw_ostream & { + return os << PtrMatrix{A}; } template -inline auto operator<<(llvm::raw_ostream &os, Array A) - 
-> std::ostream & { - return printMatrix(os, PtrMatrix{A}); +inline auto operator<<(llvm::raw_ostream &os, Array> A) + -> llvm::raw_ostream & { + return os << PtrMatrix{A}; } } // namespace math namespace utils { diff --git a/include/TurboLoop.hpp b/include/TurboLoop.hpp index 9ca9ed33f..af15d79f2 100644 --- a/include/TurboLoop.hpp +++ b/include/TurboLoop.hpp @@ -67,14 +67,15 @@ concept LoadOrStoreInst = std::same_as>; class TurboLoop { - dict::map loopMap; - // const llvm::TargetLibraryInfo *TLI; + const llvm::TargetLibraryInfo *TLI; const llvm::TargetTransformInfo *TTI; llvm::LoopInfo *LI; llvm::ScalarEvolution *SE; llvm::OptimizationRemarkEmitter *ORE; lp::LoopBlock loopBlock{}; IR::Cache instructions{}; + dict::set loopBBs; + dict::set eraseCandidates; CostModeling::CPURegisterFile registers; // this is an allocator that it is safe to reset completely when @@ -163,6 +164,7 @@ class TurboLoop { instructions.addPredicate(A, P, &(*predMapAbridged)); A->setLoopNest(AL); } + loopBBs.insert(BB); } return IR::mergeInstructions(instructions, *predMapAbridged, *TTI, *shortAllocator(), @@ -181,8 +183,7 @@ class TurboLoop { // we'd have to make sure none of the allocated instructions // can be referenced again (e.g., through the free list) // auto p = lalloc.checkpoint(); - NotNull AL = - poly::Loop::construct(lalloc, L, nwr.visit(BT), *SE); + Valid AL = poly::Loop::construct(lalloc, L, nwr.visit(BT), *SE); IR::TreeResult tr = parseExitBlocks(L); tr.rejectDepth = std::max(tr.rejectDepth, omega.size() - AL->getNumLoops()); omega.push_back(0); // we start with 0 at the end, walking backwards @@ -248,7 +249,7 @@ class TurboLoop { /// large nest. /// /// If any of the subloops fail, or we fail to draw the connection, then we - /// can optimize the continuous succesful block we've produced, and return a + /// can optimize the continuous successful block we've produced, and return a /// failure up the tree. 
/// /// @@ -331,9 +332,14 @@ class TurboLoop { void optimize(IR::TreeResult tr) { // now we build the LinearProgram - lp::OptimizationResult lpor = loopBlock.optimize(instructions, tr); + lp::LoopBlock::OptimizationResult lpor = + loopBlock.optimize(instructions, tr); if (!lpor.nodes) return; - CostModeling::optimize(instructions, loopBlock.getAllocator(), lpor); + for (IR::Addr *addr : lpor.addr.getAddr()) + loopBBs.insert(addr->getBasicBlock()); + CostModeling::optimize(loopBlock.getDependencies(), instructions, loopBBs, + eraseCandidates, loopBlock.getAllocator(), lpor); + loopBBs.clear(); } /* auto isLoopPreHeader(const llvm::BasicBlock *BB) const -> bool { @@ -367,7 +373,8 @@ class TurboLoop { // } public: TurboLoop(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) - : TTI{&FAM.getResult(F)}, + : TLI{&FAM.getResult(F)}, + TTI{&FAM.getResult(F)}, LI{&FAM.getResult(F)}, SE{&FAM.getResult(F)}, ORE{&FAM.getResult(F)}, diff --git a/test/ArrayReference.hpp b/test/ArrayReference.hpp index 33ec953fd..f33b917b1 100644 --- a/test/ArrayReference.hpp +++ b/test/ArrayReference.hpp @@ -1,14 +1,15 @@ #pragma once #include "IR/Address.hpp" #include "Math/Math.hpp" +#include "Math/MatrixDimensions.hpp" #include "Polyhedra/Loops.hpp" #include #include namespace poly { -using math::DenseMatrix, math::PtrMatrix, math::MutPtrMatrix, utils::Arena, - math::PtrVector, utils::NotNull; +using math::DenseMatrix, math::PtrMatrix, math::MutPtrMatrix, alloc::Arena, + math::PtrVector, math::DenseDims, utils::Valid; struct ArrayReference { const llvm::SCEVUnknown *basePointer; @@ -16,7 +17,7 @@ struct ArrayReference { DenseMatrix indMat; DenseMatrix offMat; llvm::SmallVector sizes; - ArrayReference(const llvm::SCEVUnknown *p, poly::Loop *l, size_t dim) + ArrayReference(const llvm::SCEVUnknown *p, poly::Loop *l, unsigned dim) : basePointer(p), loop(l), indMat(DenseDims{loop->getNumLoops(), dim}), offMat(DenseDims{dim, 1}), sizes(dim) { indexMatrix() << 0; @@ -32,12 +33,13 @@ struct 
ArrayReference { return ptrdiff_t(offMat.numRow()); } }; -inline auto createMemAccess(Arena<> *alloc, ArrayReference &ar, - llvm::Instruction *IC, PtrVector omegas) - -> NotNull { +// inline auto createMemAccess(Arena<> *alloc, ArrayReference &ar, +// llvm::Instruction *IC, PtrVector +// omegas) +// -> Valid { - IntMatrix indMatT(ar.indMat.transpose()); - return IR::Addr::construct(alloc, ar.basePointer, *ar.loop, IC, indMatT, - {ar.sizes, {}}, ar.offsetMatrix(), omegas); -} +// math::IntMatrix indMatT{ar.indMat.t()}; +// return IR::Addr::construct(alloc, ar.basePointer, *ar.loop, IC, indMatT, +// {ar.sizes, {}}, ar.offsetMatrix(), omegas); +// } } // namespace poly diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bf694dbe6..12581edd0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,6 +7,7 @@ project(LoopModelsTests LANGUAGES CXX) option(ENABLE_TEST_COVERAGE "Enable test coverage" OFF) option(TEST_INSTALLED_VERSION "Test the version found by find_package" OFF) option(ENABLE_LLD "Use lld for linking" ON) +option(TEST_LOOPMODELS "Test LoopModels" OFF) # ON FIXME # --- Import tools ---- @@ -19,6 +20,12 @@ include(../cmake/CPM.cmake) # ---- compile_commands.json ---- set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(((USE_SANITIZER MATCHES "([Aa]ddress)") OR (USE_SANITIZER MATCHES "([Aa]ddress);([Uu]ndefined)")) + AND (CMAKE_CXX_COMPILER_ID MATCHES "Clang") +) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lunwind -Wno-unused-command-line-argument") +endif() + # CPMAddPackage("gh:onqtam/doctest@2.4.9") CPMAddPackage("gh:TheLartians/Format.cmake@1.7.3") CPMAddPackage( @@ -40,10 +47,12 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(Math) -if(TEST_INSTALLED_VERSION) - find_package(LoopModels REQUIRED) -else() - add_subdirectory(.. LoopModels) +if(TEST_LOOPMODELS) + if(TEST_INSTALLED_VERSION) + find_package(LoopModels REQUIRED) + else() + add_subdirectory(.. 
LoopModels) + endif() endif() # ---- Create binary ---- @@ -53,25 +62,21 @@ file( GLOB tests CONFIGURE_DEPENDS - ${CMAKE_CURRENT_SOURCE_DIR}/bitset_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/bumpmap_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/comparator_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/compat_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dependence_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/graph_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/linear_algebra_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/linear_diophantine_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/matrix_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/normal_form_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/orthogonalize_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/remarks_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/simplex_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/string_to_intmat_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/unimodularization_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dict_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/bitset_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/bumpmap_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/comparator_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/compat_test.cpp # + # ${CMAKE_CURRENT_SOURCE_DIR}/dependence_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/graph_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/linear_algebra_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/linear_diophantine_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/matrix_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/normal_form_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/orthogonalize_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/remarks_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/simplex_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/string_to_intmat_test.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/unimodularization_test.cpp ) # list(FILTER tests EXCLUDE REGEX "remarks.*") for remarks test -find_package(LLVM 16 REQUIRED CONFIG) +find_package(LLVM 17 REQUIRED CONFIG) list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR}) # include(AddLLVM) include(${LLVM_DIR}/AddLLVM.cmake) @@ -88,7 +93,6 @@ target_precompile_headers( ${PROJECT_NAME} PRIVATE - @@ 
-137,7 +141,7 @@ target_link_libraries( ${PROJECT_NAME} PRIVATE GTest::gtest_main LLVM unordered_dense::unordered_dense Math ) set(CXX_STANDARD_REQUIRED ON) -set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20) +set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 23) set_target_properties( ${PROJECT_NAME} PROPERTIES ENVIRONMENT WORKING_DIRECTORY=${PROJECT_BINARY_DIR} ) @@ -176,26 +180,35 @@ endif() if(NOT TEST_INSTALLED_VERSION) if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") # -Werror? - target_compile_options(LoopModels PUBLIC -Wall -Wpedantic -Wextra -Wshadow) + if(TEST_LOOPMODELS) + target_compile_options(LoopModels PUBLIC -Wall -Wpedantic -Wextra -Wshadow) + endif() target_compile_options(${PROJECT_NAME} PUBLIC -Wall -Wpedantic -Wextra -Wshadow) - elseif(MSVC) + elseif(MSVC AND TEST_LOOPMODELS) target_compile_options(LoopModels PUBLIC /W4 /WX) endif() endif() # target_compile_options(LoopModels PRIVATE -D_GLIBCXX_DEBUG) target_compile_options(${PROJECT_NAME} # PRIVATE -D_GLIBCXX_DEBUG) -target_compile_options(LoopModels PRIVATE -D_GLIBCXX_ASSERTIONS) + +if(TEST_LOOPMODELS) + target_compile_options(LoopModels PRIVATE -D_GLIBCXX_ASSERTIONS) +endif() target_compile_options(${PROJECT_NAME} PRIVATE -D_GLIBCXX_ASSERTIONS) if(ENABLE_LLD) target_link_options(${PROJECT_NAME} PRIVATE -fuse-ld=lld) - target_link_options(LoopModels PRIVATE -fuse-ld=lld) + if(TEST_LOOPMODELS) + target_link_options(LoopModels PRIVATE -fuse-ld=lld) + endif() endif() # ---- code coverage ---- message(STATUS "ENABLE_TEST_COVERAGE: ${ENABLE_TEST_COVERAGE}") if(ENABLE_TEST_COVERAGE) - target_compile_options(LoopModels PUBLIC -O0 -g --coverage) - target_link_options(LoopModels PUBLIC --coverage) + if(TEST_LOOPMODELS) + target_compile_options(LoopModels PUBLIC -O0 -g --coverage) + target_link_options(LoopModels PUBLIC --coverage) + endif() target_compile_options(${PROJECT_NAME} PUBLIC -O0 -g --coverage) target_link_options(${PROJECT_NAME}
PUBLIC --coverage) add_custom_target( diff --git a/test/TestUtilities.hpp b/test/TestUtilities.hpp index 1de5f585a..b52ca71f4 100644 --- a/test/TestUtilities.hpp +++ b/test/TestUtilities.hpp @@ -1,10 +1,9 @@ #pragma once #include "Polyhedra/Loops.hpp" -#include +#include #include #include #include -#include #include #include #include @@ -24,7 +23,7 @@ namespace poly { using math::PtrMatrix; class TestLoopFunction { - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; llvm::LLVMContext ctx; llvm::Module *mod; llvm::LoopInfo LI{}; @@ -33,7 +32,7 @@ class TestLoopFunction { llvm::Function *F; llvm::DataLayout dl; llvm::TargetTransformInfo TTI; - llvm::Triple targetTripple{}; + llvm::Triple targetTriple{}; llvm::TargetLibraryInfo TLI; llvm::AssumptionCache AC; llvm::ScalarEvolution SE; @@ -45,7 +44,7 @@ class TestLoopFunction { size_t ptrIntOffset{0}; public: - auto getAlloc() -> utils::Arena<> * { return &alloc; } + auto getAlloc() -> alloc::Arena<> * { return &alloc; } auto getLoopNest(size_t i) -> poly::Loop * { return alns[i]; } auto getNumLoopNests() -> size_t { return alns.size(); } void addLoop(PtrMatrix A, size_t numLoops) { @@ -93,7 +92,7 @@ class TestLoopFunction { llvm::SmallVector(), false)}, F{llvm::Function::Create( FT, llvm::GlobalValue::LinkageTypes::ExternalLinkage, "foo", mod)}, - dl{mod}, TTI{dl}, TLI{llvm::TargetLibraryInfoImpl{targetTripple}, F}, + dl{mod}, TTI{dl}, TLI{llvm::TargetLibraryInfoImpl{targetTriple}, F}, AC{*F, &TTI}, SE{*F, TLI, AC, DT, LI}, BB{llvm::BasicBlock::Create(ctx, "entry", F)}, builder{llvm::IRBuilder(BB)} { diff --git a/test/bumpmap_test.cpp b/test/bumpmap_test.cpp index 326e4a928..44b0be6af 100644 --- a/test/bumpmap_test.cpp +++ b/test/bumpmap_test.cpp @@ -1,4 +1,5 @@ #include "Dicts/BumpMapSet.hpp" +#include #include // // NOLINTNEXTLINE(modernize-use-trailing-return-type) @@ -13,9 +14,22 @@ // } // NOLINTNEXTLINE(modernize-use-trailing-return-type) TEST(BumpDownMapTest, BasicAssertions) { - OwningArena<> alloc; + 
using M = poly::dict::amap; + static_assert( + std::same_as< + M::value_container_type, + poly::math::BumpPtrVector>>); + static_assert( + std::same_as< + M::allocator_type, + poly::alloc::WArena, 16384, true>>); + + poly::alloc::OwningArena<> alloc; + M::allocator_type walloc{&alloc}; + M::value_container_type mvals{walloc}; + // poly::math::BumpPtrVector vec{&alloc}; for (int i = 0; i < 100; ++i) { - poly::dict::amap map{&alloc}; + M map{&alloc}; for (int j = 0; j < 100; ++j) map.insert({j, j}); for (int j = 0; j < 100; ++j) EXPECT_EQ(map.find(j)->second, j); alloc.reset(); diff --git a/test/comparator_test.cpp b/test/comparator_test.cpp index 4f42507c5..63fe63821 100644 --- a/test/comparator_test.cpp +++ b/test/comparator_test.cpp @@ -23,7 +23,7 @@ TEST(BasicCompare, BasicAssertions) { // 0 0 -1 1 0 // 0 0 -1 0 1 ] IntMatrix A = "[-1 0 1 0 0; 0 -1 1 0 0; 0 0 -1 1 0; 0 0 -1 0 1]"_mat; - auto comp = poly::comparator::linear(std::allocator{}, A, + auto comp = poly::comparator::linear(alloc::Mallocator{}, A, EmptyMatrix{}, false); Vector query{{-1, 0, 0, 1, 0}}; @@ -98,7 +98,7 @@ TEST(V2Matrix, BasicAssertions) { // 0 0 0 1 0 0 -1 0; 0 0 0 0 0 0 0 0 1 0 0 1; 0 0 0 0 0 0 0 0 0 1 0 0]"_mat; auto comp = poly::comparator::LinearSymbolicComparator::construct(A, false); auto [H, U] = NormalForm::hermite(std::move(A)); - IntMatrix Ht = H.transpose(); + IntMatrix Ht = H.t(); // llvm::errs() << "Ht matrix:" << Ht << "\n"; auto Vt = IntMatrix::identity(Ht.numRow()); auto NS = NormalForm::nullSpace(Ht); diff --git a/test/compat_test.cpp b/test/compat_test.cpp index 2cbbca17c..b3b1813d3 100644 --- a/test/compat_test.cpp +++ b/test/compat_test.cpp @@ -2,6 +2,7 @@ #include "Polyhedra/Loops.hpp" #include "Support/OStream.hpp" #include "TestUtilities.hpp" +#include "Utilities/Valid.hpp" #include #include #include @@ -14,7 +15,7 @@ namespace poly { -using math::IntMatrix, utils::operator""_mat; +using math::IntMatrix, math::DenseMatrix, utils::operator""_mat; // 
NOLINTNEXTLINE(modernize-use-trailing-return-type) TEST(TrivialPruneBounds0, BasicAssertions) { @@ -94,7 +95,7 @@ TEST(LessTrivialPruneBounds, BasicAssertions) { poly::Loop &aff = *tlf.getLoopNest(0); aff.pruneBounds(); - llvm::errs() << "LessTrival test Bounds pruned:\n"; + llvm::errs() << "LessTrivial test Bounds pruned:\n"; #ifndef NDEBUG aff.dump(); #endif @@ -148,9 +149,9 @@ TEST(AffineTest0, BasicAssertions) { #endif llvm::errs() << "About to run first set of bounds tests\n"; llvm::errs() << "\nPermuting loops 1 and 2\n"; - utils::OwningArena<> allocator; - utils::NotNull affp021ptr{ - aff.rotate(allocator, "[1 0 0; 0 0 1; 0 1 0]"_mat, nullptr)}; + alloc::OwningArena<> allocator; + utils::Valid affp021ptr{ + aff.rotate(&allocator, "[1 0 0; 0 0 1; 0 1 0]"_mat, nullptr)}; poly::Loop &affp021 = *affp021ptr; // Now that we've swapped loops 1 and 2, we should have // for m in 0:M-1, k in 1:N-1, n in 0:k-1 @@ -164,9 +165,9 @@ TEST(AffineTest0, BasicAssertions) { << "\n"; llvm::errs() << "Constructed affine obj\n"; llvm::errs() << "About to run first compat test\n"; - EXPECT_FALSE(affp021.zeroExtraItersUponExtending(tlf.getAlloc(), 1, false)); + EXPECT_FALSE(affp021.zeroExtraItersUponExtending(*tlf.getAlloc(), 1, false)); llvm::errs() << "About to run second compat test\n"; - EXPECT_TRUE(affp021.zeroExtraItersUponExtending(tlf.getAlloc(), 1, true)); + EXPECT_TRUE(affp021.zeroExtraItersUponExtending(*tlf.getAlloc(), 1, true)); // affp021.zeroExtraIterationsUponExtending(poset, 1, ) } @@ -198,8 +199,9 @@ TEST(NonUnimodularExperiment, BasicAssertions) { tlf.addLoop(std::move(B), 2); poly::Loop &aff2 = *tlf.getLoopNest(tlf.getNumLoopNests() - 1); EXPECT_FALSE(aff2.isEmpty()); - OwningArena<> allocator; - NotNull affp10{aff2.rotate(allocator, "[0 1; 1 0]"_mat, nullptr)}; + alloc::OwningArena<> allocator; + utils::Valid affp10{ + aff2.rotate(&allocator, "[0 1; 1 0]"_mat, nullptr)}; llvm::errs() << "Swapped order:\n"; #ifndef NDEBUG diff --git a/test/dependence_test.cpp 
b/test/dependence_test.cpp index e738f487e..b6d8fbaed 100644 --- a/test/dependence_test.cpp +++ b/test/dependence_test.cpp @@ -129,7 +129,7 @@ TEST(DependenceTest, BasicAssertions) { Vector schLoad0(3, 0); Vector schStore(3, 0); schStore[2] = 2; - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; IR::Addr *msrc{createMemAccess(&alloc, srcA, storeA11, schStore)}; IR::Addr *mtgt01{createMemAccess(&alloc, tgtA01, loadA01, schLoad0)}; poly::DepPoly *dep0{poly::DepPoly::dependence(&alloc, *msrc, *mtgt01)}; @@ -256,7 +256,7 @@ TEST(SymmetricIndependentTest, BasicAssertions) { Vector schLoad(3, 0); Vector schStore(3, 0); schStore[2] = 1; - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; IR::Addr *msrc{createMemAccess(&alloc, srcA, storeAij, schStore)}; IR::Addr *mtgt{createMemAccess(&alloc, tgtA, loadAji, schLoad)}; poly::DepPoly *dep{poly::DepPoly::dependence(&alloc, *msrc, *mtgt)}; @@ -341,7 +341,7 @@ TEST(RankDeficientLoad, BasicAssertions) { Vector schLoad(2 + 1, 0); Vector schStore(2 + 1, 0); schStore[2] = 1; - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; IR::Addr *msrc{createMemAccess(&alloc, srcA, storeAij, schStore)}; IR::Addr *mtgt{createMemAccess(&alloc, tgtA, loadAii, schLoad)}; @@ -437,7 +437,7 @@ TEST(TimeHidingInRankDeficiency, BasicAssertions) { Vector schLoad(3 + 1, 0); Vector schStore(3 + 1, 0); schStore[3] = 1; - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; IR::Addr *msrc{createMemAccess(&alloc, refA, storeA, schStore)}; IR::Addr *mtgt{createMemAccess(&alloc, refA, loadA, schLoad)}; @@ -555,12 +555,12 @@ TEST(TriangularExampleTest, BasicAssertions) { // badly written triangular solve: // for (m = 0; m < M; ++m){ // for (n = 0; n < N; ++n){ - // A(n,m) = B(n,m); + // A[n,m] = B[n,m]; // } // for (n = 0; n < N; ++n){ - // A(n,m) = A(n,m) / U(n,n); + // A[n,m] = A[n,m] / U[n,n]; // for (k = n+1; k < N; ++k){ - // A(k,m) = A(k,m) - A(n,m)*U(k,n); + // A[k,m] = A[k,m] - U[k,n]*A[n,m]; // } // } // } @@ -652,7 +652,7 
@@ TEST(TriangularExampleTest, BasicAssertions) { IR::AddrChain addr; Vector sch2t0t0(2 + 1, 0); Vector sch2t0t1{sch2t0t0}; - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; // A(n,m) = -> B(n,m) <- IR::Addr *mSch2t0t0(createMemAccess(&alloc, indBmn, loadB, sch2t0t0)); addr.addAddr(mSch2t0t0); @@ -1140,7 +1140,7 @@ TEST(MeanStDevTest0, BasicAssertions) { lp::LoopBlock iOuterLoopNest; llvm::SmallVector iOuterMem; - utils::OwningArena<> alloc; + alloc::OwningArena<> alloc; iOuterMem.emplace_back(createMemAccess(&alloc, xInd1, storeX0, sch0t0)); // 0 iOuterMem.emplace_back( @@ -1446,7 +1446,7 @@ TEST(DoubleDependenceTest, BasicAssertions) { EXPECT_TRUE(loopBlock.optimize().has_value()); EXPECT_EQ(loopBlock.numEdges(), 2); map memAccessIds; - for (size_t jj = 0; jj < loopBlock.numIR::Addres(); ++jj) + for (size_t jj = 0; jj < loopBlock.numIR::Address(); ++jj) memAccessIds[loopBlock.getIR::Addr(jj)] = jj; for (auto &e : loopBlock.getEdges()) { auto [in, out] = e.getInOutPair(); diff --git a/test/dict_test.cpp b/test/dict_test.cpp new file mode 100644 index 000000000..f08324c8d --- /dev/null +++ b/test/dict_test.cpp @@ -0,0 +1,122 @@ +#include "Dicts/Trie.hpp" +#include +#include +#include +#include + +using poly::dict::TrieMap, poly::dict::InlineTrie; + +// NOLINTNEXTLINE(modernize-use-trailing-return-type) +TEST(TrieTest, BasicAssertions) { + std::mt19937_64 rng; + poly::alloc::OwningArena<> alloc{}; + + TrieMap d; + EXPECT_FALSE(d.find(3)); + d[&alloc, 3] = 11; + EXPECT_EQ(d.find(3)->second, 11); + d[&alloc, 3] += 11; + EXPECT_EQ(d.find(3)->second, 22); + + InlineTrie t; + EXPECT_FALSE(t.find(7)); + t[alloc, 7] = 13; + EXPECT_TRUE(t.find(7)); + EXPECT_EQ(*t.find(7), 13); + t[alloc, 7] += 14; + EXPECT_EQ(*t.find(7), 27); + //// More thorough test: + TrieMap tm; + InlineTrie it; + ankerl::unordered_dense::map m; + + // uint64_t mask = ((1ULL << 5) - 1) << 4ULL; + uint64_t mask = ((1ULL << 10) - 1) << 4ULL; + bool found = false; + // static constexpr auto 
debugval = 0xc38; + static constexpr auto debugval = 0x3c00; + // static constexpr auto debugval = 0x1358; + // static constexpr auto debugval = 0x12e8; + for (uint64_t i = 0; i < 512;) { + void *x = reinterpret_cast(rng() & mask); + if (!x) continue; + void *y = reinterpret_cast(rng() & mask); + if (!y) continue; + if (reinterpret_cast(x) == debugval) { + found = true; + auto *tmf = tm.find(y); + auto itf = it.find(y); + auto *tmfx = tm.find(x); + auto itfx = it.find(x); + std::cout << "i = " << i + 1 << "; m[y] = " << m[y] << "\n" + << "tm.find(y) = " << (tmf ? tmf->second : 0) + << "\nit.find(y) = " << (itf ? *itf : 0) + << "\ntm.find(x) = " << (tmfx ? tmfx->second : -1) + << "\nit.find(x) = " << (itfx ? *itfx : -1) + << "\ntm[a, x] = " << tm[&alloc, x] + << "\nit[a, x] = " << it[&alloc, x] + << "\ntm.find(x) = " << tm.find(x)->second + << "\nit.find(x) = " << *it.find(x) << "\n"; + } + if (found) { + void *p = reinterpret_cast(debugval); + EXPECT_EQ(m[p], tm.find(p)->second); + EXPECT_EQ(m[p], *it.find(p)); + ASSERT(m[p] == tm.find(p)->second); + ASSERT(m[p] == *it.find(p)); + } + m[x] += (++i) + m[y]; + tm[&alloc, x] += i + tm[&alloc, y]; + it[&alloc, x] += i + it[&alloc, y]; + if (reinterpret_cast(x) == debugval) { + auto *tmf = tm.find(x); + auto itf = it.find(x); + std::cout << "i = " << i << "; m[x] = " << m[x] << "\n" + << "tm.find(x) = " << (tmf ? tmf->second : -1) + << "\nit.find(x) = " << (itf ? 
*itf : -1) << "\n"; + } + EXPECT_TRUE(tm.find(x)); + EXPECT_TRUE(it.find(x)); + if (tm.find(x)->second != m[x]) std::cout << "x = " << x << "\n"; + if (*it.find(x) != m[x]) std::cout << "x = " << x << "\n"; + EXPECT_EQ(tm.find(x)->second, m[x]); + EXPECT_EQ(*it.find(x), m[x]); + void *z = reinterpret_cast(rng() & mask); + if (!z) continue; + // std::cout << "i = " << i << "\n"; + if (found) { + void *p = reinterpret_cast(debugval); + EXPECT_EQ(m[p], tm.find(p)->second); + EXPECT_EQ(m[p], *it.find(p)); + ASSERT(m[p] == tm.find(p)->second); + ASSERT(m[p] == *it.find(p)); + } + if (void *p = reinterpret_cast(debugval); p == z) { + auto *tmf = tm.find(z); + auto itf = it.find(z); + std::cout << "i = " << i << "; m[z] = " << m[z] << "\n" + << "tm.find(z) = " << (tmf ? tmf->second : -1) + << "\nit.find(z) = " << (itf ? *itf : -1) << "\n"; + } + m.erase(z); + tm.erase(z); + it.erase(z); + EXPECT_FALSE(tm.find(z)); + EXPECT_FALSE(it.find(z)); + if (reinterpret_cast(debugval) == z) found = false; + if (found) { + void *p = reinterpret_cast(debugval); + EXPECT_EQ(m[p], tm.find(p)->second); + EXPECT_EQ(m[p], *it.find(p)); + ASSERT(m[p] == tm.find(p)->second); + ASSERT(m[p] == *it.find(p)); + } + } + for (auto [k, v] : m) { + // std::cout << "k = " << k << "; v = " << v << "\n"; + EXPECT_TRUE(tm.find(k)); + EXPECT_TRUE(it.find(k)); + EXPECT_EQ(tm.find(k)->second, v); + EXPECT_EQ(*it.find(k), v); + } +} diff --git a/test/orthogonalize_test.cpp b/test/orthogonalize_test.cpp index 91efde138..a2254a7f3 100644 --- a/test/orthogonalize_test.cpp +++ b/test/orthogonalize_test.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -30,10 +29,10 @@ using math::DenseMatrix, math::DenseDims, math::PtrMatrix, math::MutPtrMatrix, math::Col, math::end, math::_, utils::operator""_mat; namespace { -auto orthogonalize(utils::Arena<> *alloc, +auto orthogonalize(alloc::Arena<> *alloc, llvm::SmallVectorImpl const &ai) -> std::optional< - std::pair>> { + 
containers::Pair>> { // need to construct matrix `A` of relationship // B*L = I @@ -66,14 +65,13 @@ auto orthogonalize(utils::Arena<> *alloc, // now, we have (A = alnp.aln->A, r = alnp.aln->r) // (A*K')*J <= r DenseMatrix AK{alnp.getA()}; - AK(_, _(numSymbols, end)) - << alnp.getA()(_, _(numSymbols, end)) * K.transpose(); + AK(_, _(numSymbols, end)) << alnp.getA()(_, _(numSymbols, end)) * K.t(); auto *alnNew = poly::Loop::construct(alloc, nullptr, std::move(AK), alnp.getSyms(), true); alnNew->pruneBounds(); math::IntMatrix KS{K * S}; - std::pair> ret{ + containers::Pair> ret{ std::make_pair(alnNew, llvm::SmallVector())}; llvm::SmallVector &newArrayRefs = ret.second; newArrayRefs.reserve(numRow); @@ -116,7 +114,7 @@ TEST(OrthogonalizeTest, BasicAssertions) { const llvm::SCEVUnknown *scevB = tlf.getSCEVUnknown(tlf.createArray()); // we have three array refs // W[i+m, j+n] - // llvm::SmallVector> + // llvm::SmallVector> ArrayReference War{scevW, aln, 2}; { MutPtrMatrix indMat = War.indexMatrix(); @@ -159,7 +157,8 @@ TEST(OrthogonalizeTest, BasicAssertions) { llvm::SmallVector ai{ allArrayRefs.data(), allArrayRefs.data() + 1, allArrayRefs.data() + 2}; - std::optional>> + std::optional< + containers::Pair>> orth(orthogonalize(tlf.getAlloc(), ai)); EXPECT_TRUE(orth.has_value()); @@ -281,7 +280,8 @@ TEST(BadMul, BasicAssertions) { llvm::SmallVector ai{ allArrayRefs.data(), allArrayRefs.data() + 1, allArrayRefs.data() + 2}; - std::optional>> + std::optional< + containers::Pair>> orth{orthogonalize(tlf.getAlloc(), ai)}; EXPECT_TRUE(orth.has_value()); @@ -328,7 +328,7 @@ TEST(OrthogonalizeMatricesTest, BasicAssertions) { // llvm::errs() << "Orthogonal A =\n" << A << "\n"; // note, A'A is not diagonal // but AA' is - B = A * A.transpose(); + B = A * A.t(); // llvm::errs() << "A'A =\n" << B << "\n"; #if !defined(__clang__) && defined(__GNUC__) #pragma GCC diagnostic push diff --git a/tools/prettyprinters.py b/tools/prettyprinters.py index 9d041f8e9..50eb504cb 100644 --- 
a/tools/prettyprinters.py +++ b/tools/prettyprinters.py @@ -99,21 +99,21 @@ def __init__(self, val): ) pp = gdb.printing.RegexpCollectionPrettyPrinter("LoopModels") -pp.add_printer("poly::math::Array", "^poly::math::Array<.*, unsigned int>$", VectorPrinter) -pp.add_printer("poly::math::::ManagedArray", "^poly::math::ManagedArray<.*, unsigned int, .*, std::allocator<.*>, .*>$", VectorPrinter) +pp.add_printer("poly::math::Array", "^poly::math::Array<.*, ptrdiff_t>$", VectorPrinter) +pp.add_printer("poly::math::::ManagedArray", "^poly::math::ManagedArray<.*, ptrdiff_t, .*, alloc::Mallocator<.*>, .*>$", VectorPrinter) pp.add_printer( "poly::math::::Array", - "^poly::math::::Array<.*, poly::math::::SquareDims>$", + "^poly::math::::Array<.*, poly::math::::SquareDims<>>$", SquareMatrixPrinter, ) pp.add_printer( "poly::math::::Array", - "^poly::math::::Array<.*, poly::math::::DenseDims>$", + "^poly::math::::Array<.*, poly::math::::DenseDims<>>$", DenseMatrixPrinter, ) pp.add_printer( "poly::math::::Array", - "^poly::math::::Array<.*, poly::math::::StridedDims>$", + "^poly::math::::Array<.*, poly::math::::StridedDims<>>$", StridedMatrixPrinter, ) gdb.printing.register_pretty_printer(gdb.current_objfile(), pp)