diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index a450ac1b..ba5863f3 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -12,20 +12,17 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v3.5.2 + uses: actions/checkout@v4 with: submodules: 'recursive' - - name: Step Python - uses: actions/setup-python@v4.6.0 + + - name: Setup Python 3.11 + uses: actions/setup-python@v5 with: - python-version: '3.11.7' - - name: Install OpenMPI for gt4py - run: | - sudo apt-get install libopenmpi-dev - - name: Install Python packages - run: | - python -m pip install --upgrade pip setuptools wheel - pip install .[develop] + python-version: '3.11' + + - name: Install pre-commit + run: pip install pre-commit + - name: Run lint via pre-commit - run: | - pre-commit run --all-files + run: pre-commit run --all-files diff --git a/.gitignore b/.gitignore index 35923df0..86441c02 100644 --- a/.gitignore +++ b/.gitignore @@ -153,6 +153,7 @@ dmypy.json # GT4Py **/.gt_cache*/ +**/.gt4py_cache/ # Tests .my_cache_path/* @@ -169,7 +170,6 @@ RESTART/ _dacegraphs **/testing/output/* -*.png *.nc # VSCode diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..03d48f39 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   Copyright 2022 Allen Institute for Artificial Intelligence (AI2)
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
index 3554660c..3ea17add 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,9 @@ NDSL submodules `gt4py` and `dace` to point to vetted versions, use `git clone -
 NDSL is __NOT__ available on `pypi`. Installation of the package has to be local, via `pip install ./NDSL` (`-e` supported). The packages has a few options:
 
 - `ndsl[test]`: installs the test packages (based on `pytest`)
-- `ndsl[develop]`: installs tools for development and tests.
+- `ndsl[demos]`: installs extra requirements to run the [NDSL examples](./examples/NDSL/)
+- `ndsl[docs]`: installs extra requirements to build the docs
+- `ndsl[develop]`: installs tools for development, docs, and tests.
 
 Tests are available via:
 
@@ -45,11 +47,28 @@ For GPU backends (the above plus):
 
 ## Development
 
-TBD: Code/contribution guideline
+### Code/contribution guidelines
 
-TBD: Documentation
+TBD
 
-Point of Contacts:
+### Documentation
+
+We are using [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/), which allows us to write the docs in Markdown files and optionally serve them as a static site.
+
+To view the documentation, install NDSL with the `docs` or `develop` extras. Then just run
+
+```bash
+mkdocs serve
+```
+
+Contributing to the documentation is straightforward:
+
+1. Add and/or change files in the [docs/](./docs/) folder as necessary.
+2. [Optional] If you have changes to the navigation, modify [mkdocs.yml](mkdocs.yml).
+3. [Optional] Start the development server and check how your changes are rendered.
+4. Submit a pull request with your changes.
+
+## Points of Contact
 
 - NOAA: Rusty Benson: rusty.benson -at- noaa.gov
 - NASA: Florian Deconinck florian.g.deconinck -at- nasa.gov
diff --git a/docs/dev/dace.md b/docs/dev/dace.md
new file mode 100644
index 00000000..6c44cad2
--- /dev/null
+++ b/docs/dev/dace.md
@@ -0,0 +1,5 @@
+# DaCe
+
+[DaCe](https://spcldace.readthedocs.io/en/latest/index.htm) is the full-program optimization framework used in NDSL. DaCe is short for Data-Centric Parallel Programming; it is developed at ETH Zurich's Scalable Parallel Computing Laboratory (SPCL).
+
+In NDSL, DaCe powers the [performance backends](https://geos-esm.github.io/SMT-Nebulae/technical/backend/dace-bridge/) of [GT4Py](./gt4py.md). In particular, in NDSL's orchestration feature we will encode [macro-level optimizations](https://geos-esm.github.io/SMT-Nebulae/technical/backend/ADRs/stree/) like loop reordering and stencil fusion using DaCe.
diff --git a/docs/dev/gt4py.md b/docs/dev/gt4py.md
new file mode 100644
index 00000000..b334831f
--- /dev/null
+++ b/docs/dev/gt4py.md
@@ -0,0 +1,5 @@
+# GT4Py
+
+!!! warning
+
+    TODO: Add some docs on GT4Py here
diff --git a/docs/dev/index.md b/docs/dev/index.md
new file mode 100644
index 00000000..91b7ca70
--- /dev/null
+++ b/docs/dev/index.md
@@ -0,0 +1,107 @@
+# Under the hood
+
+This is the technical part of the documentation, geared towards developers contributing to NDSL.
+
+## Introduction
+
+In recent years, Python has become the dominant programming language in the machine learning and data science communities because it is easy to learn and program in. The performance of Python, however, remains a major concern in the scientific computing and HPC communities, where the most widely used programming languages are C/C++ and Fortran and Python often serves only as a scripting language for pre- and post-processing.
+
+The major performance issue in Python, especially in computation-intensive applications, is loops, which are often the performance bottleneck of an application in other programming languages, such as C++ and Fortran, as well. Python programs are commonly observed to be 10x to 100x slower than C, C++, and Fortran programs. To reach peak hardware performance, the scientific computing community has tried different programming models: OpenMP, Cilk+, Threading Building Blocks (TBB), and POSIX threads for multi-/many-core processors, and Kokkos, RAJA, OpenMP offload, and OpenACC for the highest performance on heterogeneous CPU/GPU systems. All of these programming models are only available for C, C++, and Fortran; only a few efforts target high performance for the Python programming language.
+
+The Python-based NDSL programming model described in this developer's guide provides an alternative solution: it reaches peak hardware performance with relatively little programming effort by using stencil semantics. A stencil is similar to the parallel-for kernels used in Kokkos and RAJA; it updates array elements according to a fixed access pattern. With stencil semantics, NDSL can, for example, express matrix multiplication kernels in about 30 lines of code that match the performance of cuBLAS/hipBLAS, which many GPU programmers cannot achieve in CUDA/HIP. This greatly reduces the programmer's effort, and NDSL has already been used successfully in the Pace global climate model, achieving up to a 4x speedup over the original Fortran implementation.
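+
+As a taste of those stencil semantics, here is a minimal sketch of what user code looks like (the `smooth` function and its fields are illustrative examples, not part of the NDSL API):
+
+```python
+from gt4py.cartesian.gtscript import PARALLEL, computation, interval
+
+from ndsl.dsl.typing import FloatField
+
+
+def smooth(field_in: FloatField, field_out: FloatField):
+    # One assignment describes the update of every (i, j, k) point; the
+    # relative offsets encode the fixed access pattern of the stencil.
+    with computation(PARALLEL), interval(...):
+        field_out = (
+            field_in[-1, 0, 0]
+            + field_in[1, 0, 0]
+            + field_in[0, -1, 0]
+            + field_in[0, 1, 0]
+        ) / 4.0
+```
+
+Such a definition is then compiled for a chosen backend through a stencil factory (see the porting guide for a complete example).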
+
+## Programming model
+
+The NDSL programming model is composed of backend execution spaces, performance optimization passes and transformations, and memory spaces and memory layouts. These abstractions allow the formulation of generic algorithms and data structures which can then be mapped to different types of hardware architectures. Effectively, they allow compile-time transformation of algorithms to adapt to varying degrees of hardware parallelism as well as to the memory hierarchy. Figure 1 shows the high-level architecture of NDSL (without the orchestration option). NDSL uses a hierarchy of intermediate representations (IRs) to abstract the structure of a computational program, which reduces the complexity and maintenance cost of application code while increasing code portability and scalability. This method also avoids, where feasible, having to raise information from lower-level representations by means of static analysis, and performs optimizations at the highest possible level of abstraction. Because the method primarily leverages structural information readily available in the source code, it enables optimizations such as loop fusion, tiling, and vectorization without the need for complicated analysis and heuristics.
+
+![NDSL flow](../images/dev/ndsl_flow.png)
+
+In NDSL, the Python frontend converts the user-defined stencils to a Python AST using the built-in `ast` module. In an AST, each node is an object defined by the Python AST grammar (for more details, see https://docs.python.org/3/library/ast.html). The AST node visitor, the `IRMaker` class in `NDSL/external/gt4py/src/gt4py/cartesian/frontend/gtscript_frontend.py`, traverses the AST of a Python function decorated with `@gtscript.function` and/or of stencil objects; the Python AST of the program is then lowered to the Definition IR. The Definition IR is a high-level IR composed of the high-level program, domain-specific information, and the structure of the computational operations, all independent of the low-level hardware platform. Defining a high-level IR allows transformations of the IR without losing the performance of numerical libraries. However, the high-level IR does not contain the detailed information required for performance on specific low-level runtime hardware. Specifically, the Definition IR only preserves the information necessary to lower operations to hardware instructions implementing coarse-grained vector operations, or to numerical libraries such as cuBLAS/hipBLAS and Intel MKL.
+
+The Definition IR is then transformed to GTIR (`gt4py/src/gt4py/cartesian/frontend/defir_to_gtir.py`). A GTIR stencil is defined in NDSL as
+
+```python
+class Stencil(LocNode, eve.ValidatedSymbolTableTrait):
+    name: str
+    api_signature: List[Argument]
+    params: List[Decl]
+    vertical_loops: List[VerticalLoop]
+    externals: Dict[str, Literal]
+    sources: Dict[str, str]
+    docstring: str
+
+    @property
+    def param_names(self) -> List[str]:
+        return [p.name for p in self.params]
+
+    _validate_lvalue_dims = common.validate_lvalue_dims(VerticalLoop, FieldDecl)
+```
+
+GTIR is also a high-level IR. It contains the `vertical_loops` statements: in climate applications, vertical loops usually need special treatment, with numerical instability being one reason. Keeping the `vertical_loops` in GTIR as separate code blocks helps the implementation of the subsequent performance passes and transformations.
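+
+As an illustration of why vertical loops are first-class citizens, a column accumulation must sweep the vertical dimension in order (a hypothetical sketch; the function and field names are illustrative):
+
+```python
+from gt4py.cartesian.gtscript import FORWARD, computation, interval
+
+from ndsl.dsl.typing import FloatField
+
+
+def column_accumulate(q: FloatField, q_sum: FloatField):
+    # FORWARD sweeps k upward; q_sum[0, 0, -1] reads the value computed at
+    # the level below, a data dependency a PARALLEL computation cannot express.
+    with computation(FORWARD):
+        with interval(0, 1):
+            q_sum = q
+        with interval(1, None):
+            q_sum = q_sum[0, 0, -1] + q
+```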
+
+Program analysis passes/transformations are applied on the GTIR to remove redundant nodes, prune unused parameters, propagate the data types and shapes of symbols, and compute loop extents.
+
+The GTIR is then further lowered to the optimization IR (OIR), which is defined as
+
+```python
+class Stencil(LocNode, eve.ValidatedSymbolTableTrait):
+    name: str
+    # TODO: fix to be List[Union[ScalarDecl, FieldDecl]]
+    params: List[Decl]
+    vertical_loops: List[VerticalLoop]
+    declarations: List[Temporary]
+
+    _validate_dtype_is_set = common.validate_dtype_is_set()
+    _validate_lvalue_dims = common.validate_lvalue_dims(VerticalLoop, FieldDecl)
+```
+
+The OIR is designed specifically for performance optimization; the optimization algorithms are carried out on the OIR as passes/transformations. Currently, vertical loop merging, horizontal execution merging, loop unrolling, vectorization, statement fusion, and pruning optimizations are available, activated via environment variables in the `oir_pipeline` module.
+
+After the optimization pipeline finishes, the OIR is converted to a backend IR, for example the DaCe IR (SDFG). The DaCe SDFG can be further optimized by DaCe's own embedded passes/transformations, but in the Pace application we did not activate this optimization step. It should be pointed out that during the OIR-to-SDFG conversion, each horizontal execution node is serialized into an SDFG library node, within which the loop expansion information is encoded.
+
+When using the GT backend, the OIR is used directly by the GT4Py code generator to generate the C++ GridTools stencils (the computation code) and the Python binding code. In this backend, each horizontal execution node generates a separate GridTools stencil.
+
+NDSL also supports a whole-program optimization model, called the _orchestration_ model; currently it only supports the DaCe backends. Whole-program optimization with DaCe is the process of turning all Python and GT4Py code into generated C++. Only the runtime code of the model is _orchestrated_, e.g. everything in the `__call__` method of a module; all code in `__init__` is executed like a normal GT backend.
+
+At the highest level in Pace, to turn on orchestration you need to flip `FV3_DACEMODE` to an orchestrated option _and_ run a `dace:*` backend (it will error out if anything else is run). The options for `FV3_DACEMODE` are:
+
+- _Python_: default, turns orchestration off.
+- _Build_: builds the SDFG, then exits without running. See _Build_ for the limitations of the build strategy.
+- _BuildAndRun_: as above, but distributes the build and keeps running.
+- _Run_: tries to execute, errors out if the cache doesn't exist.
+
+Code is orchestrated in two ways:
+
+- functions are orchestrated via the `orchestrate_function` decorator,
+- methods are orchestrated via the `orchestrate` function (e.g. `pace.driver.Driver._critical_path_step_all`).
+
+The latter is the way we orchestrate in our model. `orchestrate` is often called as the first function in `__init__`. It patches the methods _in place_, replacing them with a wrapper that deals with turning it all into an executable SDFG when call time comes, as sketched below.
+
+The orchestration has two parameters: `config` (expanded on later) and `dace_compiletime_args`.
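+
+A minimal sketch of that pattern (the class, its arguments, and the way the DaCe config is retrieved are illustrative; `orchestrate` and its keyword arguments are defined in `ndsl/dsl/dace/orchestration.py`):
+
+```python
+from ndsl.dsl.dace.orchestration import orchestrate
+
+
+class DynCoreStep:
+    def __init__(self, stencil_factory, quantity_factory):
+        # Patch __call__ in place: when an orchestrated FV3_DACEMODE and a
+        # dace:* backend are active, it is lowered to an SDFG at call time.
+        orchestrate(
+            obj=self,
+            config=stencil_factory.config.dace_config,
+            dace_compiletime_args=["state"],
+        )
+        ...
+
+    def __call__(self, state, timestep):
+        # Runtime code: the stencil calls and Python control flow in here
+        # are captured by the whole-program optimization.
+        ...
+```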
+
+All memory needs to be described to DaCe so that it can be interfaced with in the C code that will be executed. Some memory is parsed automatically (e.g. numpy, cupy, scalars); other memory needs a description. In our case, `Quantity` and a few other types need to be flagged as `dace.compiletime`, which tells DaCe not to try to parse the memory ahead of time but to wait for JIT time. `dace_compiletime_args` helps with tagging those without having to change the type hints.
+
+Figure 2 shows the hierarchy of intermediate representations (IRs) and the lowering process when the orchestration option is activated.
+
+![NDSL orchestration](../images/dev/ndsl_orchestration.png)
+
+When the orchestration option is turned on, the call method object is patched in place, replacing the original callable with a wrapper that will trigger orchestration at call time. If the model configuration doesn't demand orchestration, this won't do anything. The orchestrated call methods and the computational stencils (lazy computational stencils, cached in a container) are parsed to a Python AST by the frontend at runtime, and the Python AST is then converted to a DaCe SDFG. Analysis and optimization are applied before the C++ code is generated by the code generator. This process is a Just-In-Time (JIT) build, in contrast with the non-orchestrated model, which is compiled and built eagerly. The JIT build caches the build information of the computational stencils and orchestrated methods, and it makes it more convenient to apply analysis and optimization passes to the overall code; for example, merging neighboring stencils becomes easy. Therefore, more optimized code can be generated, and better performance can be achieved at runtime.
+
+## Analysis and Optimization
+
+One of the major features of NDSL is that users can develop new passes/transformations for a backend targeting new hardware; such passes and/or transformations are the key ingredients for achieving good performance on the new hardware. At different abstraction levels, the passes and/or transformations perform different levels of optimization. For example, loop-level optimization is independent of the hardware and can be applied to any backend, while device placement and memory and cache optimizations depend on the backend and hardware. In this section, we focus only on the optimizations that are independent of the backend hardware.
+
+The general procedure of code optimization has two steps: first, a filter function is called to find the patterns to which the pass and/or transformation applies; then the pass and/or transformation is applied to the filtered patterns, inserting, deleting, or replacing existing nodes with optimized nodes. In NDSL, passes/transformations such as the following are provided.
+
+```python
+def prune_unused_parameters(node: gtir.Stencil) -> gtir.Stencil:
+    assert isinstance(node, gtir.Stencil)
+    used_variables = (
+        node.walk_values()
+        .if_isinstance(gtir.FieldAccess, gtir.ScalarAccess)
+        .getattr("name")
+        .to_list()
+    )
+    used_params = list(filter(lambda param: param.name in used_variables, node.params))
+    return node.copy(update={"params": used_params})
+```
+
+## Code generators
diff --git a/docs/images/dev/ndsl_flow.png b/docs/images/dev/ndsl_flow.png
new file mode 100644
index 00000000..37a8d435
Binary files /dev/null and b/docs/images/dev/ndsl_flow.png differ
diff --git a/docs/images/dev/ndsl_orchestration.png b/docs/images/dev/ndsl_orchestration.png
new file mode 100644
index 00000000..507e618d
Binary files /dev/null and b/docs/images/dev/ndsl_orchestration.png differ
diff --git a/docs/images/translate/image1.png b/docs/images/translate/image1.png
new file mode 100644
index 00000000..529d55ea
Binary files /dev/null and b/docs/images/translate/image1.png differ
diff --git a/docs/images/translate/image2.png b/docs/images/translate/image2.png
new file mode 100644
index 00000000..b73ea3f0
Binary files /dev/null and b/docs/images/translate/image2.png differ
diff --git a/docs/images/translate/image3.png b/docs/images/translate/image3.png
new file mode 100644
index 00000000..784aa36d
Binary files /dev/null and b/docs/images/translate/image3.png differ
diff --git a/docs/images/translate/image4.png b/docs/images/translate/image4.png
new file mode 100644
index 00000000..3ec73a86
Binary files /dev/null and b/docs/images/translate/image4.png differ
diff --git a/docs/images/translate/image5.png b/docs/images/translate/image5.png
new file mode 100644
index 00000000..0b4b90d9
Binary files /dev/null and b/docs/images/translate/image5.png differ
diff --git a/docs/includes/glossary.md b/docs/includes/glossary.md
new file mode 100644
index 00000000..c40936cc
--- /dev/null
+++ b/docs/includes/glossary.md
@@ -0,0 +1,24 @@
+
+
+*[CSCS]: Swiss National Supercomputing Center
+*[ETH]: Swiss Federal Institute of Technology
+*[GFDL]: Geophysical Fluid Dynamics Laboratory
+*[NASA]: National Aeronautics and Space Administration
+*[NOAA]: National Oceanic and Atmospheric Administration
+*[SPCL]: Scalable Parallel Computing Laboratory (ETH Zurich)
+
+
+
+
+*[DSL]: Domain-specific language
+*[FORTRAN]: Formula Translation, a programming language of long standing in scientific computing
+*[IR]: Intermediate Representation: An abstraction between source code and machine code, designed to simplify analysis and optimization during program compilation.
+*[NDSL]: NOAA/NASA Domain Specific Language middleware
+*[SDFG]: Stateful Dataflow multiGraphs - the IR of DaCe
+
+
+*[FMS]: Flexible Modeling System - see https://github.com/NOAA-GFDL/FMS
+*[FV3]: GFDL Finite-Volume Cubed-Sphere Dynamical Core
+
+
+*[ULP]: Unit in the last place: The difference between two adjacent floating-point numbers.
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 00000000..24053f27
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,116 @@
+# NDSL Documentation
+
+NDSL allows atmospheric scientists to focus on what matters in model development and hides away the complexities of coding for a supercomputer.
+
+## Quick Start
+
+Python `3.11.x` is required to install NDSL and all its third-party dependencies.
+
+NDSL pins its submodules `gt4py` and `dace` to vetted versions; use `git clone --recurse-submodules` to check out the git submodules.
+
+NDSL is **NOT** available on `pypi`. Installation of the package has to be local, via `pip install ./NDSL` (`-e` supported). The package has a few options:
+
+- `ndsl[test]`: installs the test packages (based on `pytest`)
+- `ndsl[develop]`: installs tools for development and tests.
+
+NDSL uses `pytest` for its unit tests, which are available via:
+
+- `pytest -x test`: running CPU serial tests (GPU as well if `cupy` is installed)
+- `mpirun -np 6 pytest -x test/mpi`: running CPU parallel tests (GPU as well if `cupy` is installed)
+
+## Requirements & supported compilers
+
+For CPU backends:
+
+- 3.11.x <= Python < 3.12.x
+- Compilers:
+  - GNU 11.2+
+- Libraries:
+  - Boost headers 1.76+ (no lib installed, just headers)
+
+For GPU backends (the above plus):
+
+- CUDA 11.2+
+- Python package:
+  - `cupy` (latest with proper driver support [see install notes](https://docs.cupy.dev/en/stable/install.html))
+- Libraries:
+  - MPI compiled with CUDA support
+
+## NDSL installation and testing
+
+NDSL is not available on `pypi`; it is installed locally from a clone of the repository:
+
+```bash
+pip install ./NDSL
+```
+
+NDSL has a few options:
+
+- `ndsl[test]`: installs the test packages (based on `pytest`)
+- `ndsl[develop]`: installs tools for development and tests.
+
+Tests are available via:
+
+- `pytest -x test`: running CPU serial tests (GPU as well if `cupy` is installed)
+- `mpirun -np 6 pytest -x test/mpi`: running CPU parallel tests (GPU as well if `cupy` is installed)
+
+## Configurations for Pace
+
+Configurations for Pace to use NDSL with different backends:
+
+- `FV3_DACEMODE=Python|Build|BuildAndRun|Run` controls the full-program optimizer behavior (see the example below):
+
+  - Python: default, uses stencils only, no full-program optimization
+
+  - Build: builds the program, then exits. This _builds no matter what_. (backend must be `dace:gpu` or `dace:cpu`)
+
+  - BuildAndRun: same as above, but after the build the program keeps executing (backend must be `dace:gpu` or `dace:cpu`)
+
+  - Run: loads the pre-compiled program and executes; fails if the `.so` is not present (_no hash check!_) (backend must be `dace:gpu` or `dace:cpu`)
+
+- `PACE_FLOAT_PRECISION=64` controls the floating point precision throughout the program.
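+
+For example, a run with the orchestrated DaCe backend in 32-bit mode would be configured along these lines (illustrative values; the actual launch command depends on your setup):
+
+```bash
+export FV3_DACEMODE=BuildAndRun
+export PACE_FLOAT_PRECISION=32
+```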
+
+Install Pace with different NDSL backends:
+
+- Shell scripts to install Pace using the NDSL backend on specific machines such as Gaea can be found in `examples/build_scripts/`.
+- When cloning Pace you will need to update the repository's submodules as well:
+
+```bash
+git clone --recursive https://github.com/ai2cm/pace.git
+```
+
+  or if you have already cloned the repository:
+
+```bash
+git submodule update --init --recursive
+```
+
+- Pace requires GCC > 9.2, MPI, and Python 3.8 on your system, and CUDA is required to run with a GPU backend.
+You will also need the headers of the boost libraries in your `$PATH` (boost itself does not need to be installed).
+If installed outside the standard header locations, gt4py requires that `$BOOST_ROOT` be set:
+
+```bash
+cd BOOST/ROOT
+wget https://boostorg.jfrog.io/artifactory/main/release/1.79.0/source/boost_1_79_0.tar.gz
+tar -xzf boost_1_79_0.tar.gz
+mkdir -p boost_1_79_0/include
+mv boost_1_79_0/boost boost_1_79_0/include/
+export BOOST_ROOT=BOOST/ROOT/boost_1_79_0
+```
+
+- We recommend creating a python `venv` or conda environment specifically for Pace.
+
+```bash
+python3 -m venv venv_name
+source venv_name/bin/activate
+```
+
+- Inside of your Pace `venv` or conda environment, pip install the Python requirements, GT4Py, and Pace:
+
+```bash
+pip3 install -r requirements_dev.txt -c constraints.txt
+```
+
+- There are also separate requirements files which can be installed for linting (`requirements_lint.txt`) and building documentation (`requirements_docs.txt`).
diff --git a/docs/porting/index.md b/docs/porting/index.md
new file mode 100644
index 00000000..51459663
--- /dev/null
+++ b/docs/porting/index.md
@@ -0,0 +1,87 @@
+# Notes on porting FORTRAN code
+
+This part of the documentation includes notes about porting FORTRAN code to NDSL.
+
+## General Concepts
+
+Since we are not trying to do model development but rather to replicate an existing model, the main philosophy is to replicate model behavior as precisely as possible.
+Since weather and climate models can take diverging paths based on very small input differences, as described in [\[1\]][1], bitwise reproducible code is impossible to achieve.
+There have been attempts at solving this problem, as shown in [\[2\]][2] or [\[3\]][3], but all of those require heavy modifications to the original code.
+In our case, the switch from the original FORTRAN environment to a C++ environment can already contribute to these small errors showing up, and therefore a 1:1 validation on a large scale is impossible.
+This effect gets further enhanced by computation on GPUs.
+Lastly, the mixing of precisions found in various models is often done slightly unmethodically and can further complicate the understanding of what precision is required where.
+
+Since large-scale validation is therefore close to impossible, we try to get reproducible results (within a margin) on smaller sub-components of the model.
+When porting code, we break down larger components into logical, numerically coherent substructures that can be tested and validated individually.
+This breakdown serves two main purposes:
+
+1. Give us confidence that the ported code behaves as intended.
+2. Allow us to monitor if or how performance optimization down the road changes the numerical results of our model components.
+
+## Porting Guidelines
+
+Since GT4Py has certain restrictions on what can be in the same stencil and what needs to be in separate stencils, there is no absolute 1:1 mapping that can or should be applied.
+
+The best practices we found are:
+
+1. A numerically self-contained module should always live in a single class.
+2. If possible, try to isolate individual numerical motifs into functions.
+
+### Example
+
+To illustrate best practices, we show a stripped-down version of the nonhydrostatic vertical solver on the C-grid (also known as the Riemann solver):
+
+#### Main definition
+
+```python
+class NonhydrostaticVerticalSolverCGrid:
+    def __init__(self, ...):
+        # Definition of the (potentially multiple) stencils to call
+        self._precompute_stencil = stencil_factory.from_origin_domain(
+            precompute,
+            origin=origin,
+            domain=domain,
+        )
+        self._compute_sim1_solve = stencil_factory.from_origin_domain(
+            sim1_solver,
+            origin=origin,
+            domain=domain,
+        )
+        # Definition of a temporary variable shared across the two stencils
+        # that is not used outside the module
+        self._pfac = FloatFieldIJ()
+        ...
+
+    def __call__(self, cappa: FloatField, delpc: FloatField):
+        self._precompute_stencil(cappa, self._pfac)
+        self._compute_sim1_solve(self._pfac, delpc)
+```
+
+#### Stencil Definitions
+
+```python
+# constants definition
+c1 = Float(-2.0) / Float(14.0)
+c2 = Float(11.0) / Float(14.0)
+c3 = Float(5.0) / Float(14.0)
+
+
+# function for a standalone numerical motif
+@gtscript.function
+def vol_conserv_cubic_interp_func_y(v):
+    return c1 * v[0, -2, 0] + c2 * v[0, -1, 0] + c3 * v
+
+
+def precompute(cappa: FloatField, _pfac: FloatFieldIJ):
+    # small computation directly in the stencil
+    with computation(PARALLEL), interval(...):
+        # a variable used only in one stencil can be defined here
+        tmpvar = cappa[1, 0, 0] + 1
+    with computation(PARALLEL), interval(0, 1):
+        _pfac = tmpvar[0, 0, 1]
+
+
+def sim1_solver(cappa: FloatField, _pfac: FloatFieldIJ):
+    with computation(PARALLEL), interval(...):
+        cappa = vol_conserv_cubic_interp_func_y(cappa) + _pfac
+```
+
+[1]: "Chaos in climate models"
+[2]: "Reproducible Climate Simulations"
+[3]: "Bit reproducible HPC applications"
diff --git a/docs/porting/translate/index.md b/docs/porting/translate/index.md
new file mode 100644
index 00000000..1aa083bd
--- /dev/null
+++ b/docs/porting/translate/index.md
@@ -0,0 +1,60 @@
+# Translate tests
+
+We call tests that validate subsets of computation against serialized data "translate tests". These should provide a baseline with which we can validate ported code and ensure the pipeline generates expected results.
+
+## The Translate infrastructure
+
+The infrastructure is set up such that, for basic cases, the default implementations are enough:
+
+The `TranslateFortranData2Py` base class will be evaluated through the function `test_sequential_savepoint`.
+The general structure is:
+
+1. Extract tolerances for errors - either the defaults or the overwritten ones:
+    - Maximal absolute error
+    - Maximal relative error
+    - Allowed ULP difference
+2. Extract input data from `{savepoint_name}-In.nc`.
+3. Run the `compute` function, returning the outputs.
+4. Extract reference output data from `{savepoint_name}-Out.nc`.
+5. Compare the data in `out_vars` to the reference data.
+
+For these steps to work, the name of the translate test needs to match the name of the data.
+In case special handling is required, almost everything can be overwritten:
+
+### Overwriting thresholds
+
+You can create an overwrite file to manually set the thresholds in your data directory:
+
+![image1.png](../../images/translate/image1.png)
+
+### Overwriting arguments to your compute function
+
+The `compute_func` will be called automatically in the test. If the names in the NetCDF match the `kwargs` of your function directly, no further action is required:
+
+![image2.png](../../images/translate/image2.png)
+
+If you need to rename a variable from the NetCDF, you can use `"serialname"`:
+
+![image3.png](../../images/translate/image3.png)
+
+The same applies to scalar inputs, via parameters:
+
+![image4.png](../../images/translate/image4.png)
+
+### Modifying output variables
+
+This can be required if not all output is serialized, if the naming is different, or if we need the same data as the input:
+
+![image4.png](../../images/translate/image4.png)
+
+### Modifying the `compute` function
+
+Normally, `compute` has three steps:
+
+1. setup input
+2. call `compute_func`
+3. slice outputs
+
+Slight adaptations to every step are possible:
+
+![image5.png](../../images/translate/image5.png)
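+
+Putting these pieces together, a basic translate test can be as small as the following sketch (hypothetical names throughout: it assumes serialized data saved as `MyModule-In.nc`/`MyModule-Out.nc`, a ported `MyModule` class, and `TranslateFortranData2Py` imported from NDSL's testing utilities):
+
+```python
+class TranslateMyModule(TranslateFortranData2Py):
+    def __init__(self, grid, namelist, stencil_factory):
+        super().__init__(grid, stencil_factory)
+        # The ported module under test becomes the compute function.
+        self.compute_func = MyModule(stencil_factory, grid.grid_data)
+        self.in_vars["data_vars"] = {
+            "cappa": {},                   # name matches the NetCDF variable
+            "pfac": {"serialname": "pf"},  # renamed from the serialized name
+        }
+        self.in_vars["parameters"] = ["dt"]  # scalar input
+        self.out_vars = {"cappa": {}}  # compared against MyModule-Out.nc
+```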
diff --git a/docs/user/index.md b/docs/user/index.md
new file mode 100644
index 00000000..292d3953
--- /dev/null
+++ b/docs/user/index.md
@@ -0,0 +1,3 @@
+# Usage documentation
+
+This part of the documentation is geared towards users of NDSL.
diff --git a/external/dace b/external/dace
index da644fe8..13402cbf 160000
--- a/external/dace
+++ b/external/dace
@@ -1 +1 @@
-Subproject commit da644fe8c179022fe8e730fb3f47f6399f1db4ce
+Subproject commit 13402cbfeeb6969cbd3915acfb7a30bdb543071b
diff --git a/external/gt4py b/external/gt4py
index 0ddddd37..1ba0a972 160000
--- a/external/gt4py
+++ b/external/gt4py
@@ -1 +1 @@
-Subproject commit 0ddddd37d3056ad6518f33908eb02f3b1f992878
+Subproject commit 1ba0a97282037a6756f5da23d207a362383e5743
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 00000000..09916f21
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,53 @@
+site_name: NDSL Documentation
+
+theme:
+  name: material
+  features:
+    - search.suggest
+    - search.highlight
+    - search.share
+
+nav:
+  - Home: index.md
+  - User documentation: user/index.md
+  - Porting:
+      - General Concepts: porting/index.md
+      - Testing Infrastructure: porting/translate/index.md
+  - Under the hood:
+      - Technical Documentation: dev/index.md
+      - DaCe: dev/dace.md
+      - GT4Py: dev/gt4py.md
+
+
+markdown_extensions:
+  # simple glossary file
+  - abbr
+  # support for colored notes / warnings / tips / examples
+  - admonition
+  # support for footnotes
+  - footnotes
+  # support for syntax highlighting
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets:
+      auto_append:
+        # hover tooltips for abbreviations (simple glossary)
+        - docs/includes/glossary.md
+  - pymdownx.superfences:
+      custom_fences:
+        # support for mermaid graphs
+        - name: mermaid
+          class: mermaid
+          format: python/name:pymdownx.superfences.fence_code_format
+  # image inclusion
+
+plugins:
+  # add search box to the header, configuration in theme
+  - search
+
+watch:
+  # reload when the glossary file is updated
+  - docs/includes
diff --git a/ndsl/boilerplate.py b/ndsl/boilerplate.py
index a777cd82..41b34820 100644
--- a/ndsl/boilerplate.py
+++ b/ndsl/boilerplate.py
@@ -44,7 +44,7 @@ def _get_factories(
     compilation_config = CompilationConfig(
         backend=backend,
-        rebuild=True,
+        rebuild=False,
         validate_args=True,
         format_source=False,
         device_sync=False,
diff --git a/ndsl/comm/mpi.py b/ndsl/comm/mpi.py
index 6b3ff17f..3c466950 100644
--- a/ndsl/comm/mpi.py
+++ b/ndsl/comm/mpi.py
@@ -6,7 +6,6 @@
 from typing import Dict, List, Optional, TypeVar, cast
 
 from ndsl.comm.comm_abc import Comm, ReductionOperator, Request
-from ndsl.logging import ndsl_log
 
 
 T = TypeVar("T")
@@ -43,70 +42,46 @@ def Get_size(self) -> int:
         return self._comm.Get_size()
 
     def bcast(self, value: Optional[T], root=0) -> T:
-        ndsl_log.debug("bcast from root %s on rank %s", root, self._comm.Get_rank())
         return self._comm.bcast(value, root=root)
 
     def barrier(self):
-        ndsl_log.debug("barrier on rank %s", self._comm.Get_rank())
         self._comm.barrier()
 
     def Barrier(self):
         pass
 
     def Scatter(self, sendbuf, recvbuf, root=0, **kwargs):
-        ndsl_log.debug("Scatter on rank %s with root %s", self._comm.Get_rank(), root)
         self._comm.Scatter(sendbuf, recvbuf, root=root, **kwargs)
 
     def Gather(self, sendbuf, recvbuf, root=0, **kwargs):
-        ndsl_log.debug("Gather on rank %s with root %s", self._comm.Get_rank(), root)
self._comm.Gather(sendbuf, recvbuf, root=root, **kwargs) def allgather(self, sendobj: T) -> List[T]: - ndsl_log.debug("allgather on rank %s", self._comm.Get_rank()) return self._comm.allgather(sendobj) def Send(self, sendbuf, dest, tag: int = 0, **kwargs): - ndsl_log.debug("Send on rank %s with dest %s", self._comm.Get_rank(), dest) self._comm.Send(sendbuf, dest, tag=tag, **kwargs) def sendrecv(self, sendbuf, dest, **kwargs): - ndsl_log.debug("sendrecv on rank %s with dest %s", self._comm.Get_rank(), dest) return self._comm.sendrecv(sendbuf, dest, **kwargs) def Isend(self, sendbuf, dest, tag: int = 0, **kwargs) -> Request: - ndsl_log.debug("Isend on rank %s with dest %s", self._comm.Get_rank(), dest) return self._comm.Isend(sendbuf, dest, tag=tag, **kwargs) def Recv(self, recvbuf, source, tag: int = 0, **kwargs): - ndsl_log.debug("Recv on rank %s with source %s", self._comm.Get_rank(), source) self._comm.Recv(recvbuf, source, tag=tag, **kwargs) def Irecv(self, recvbuf, source, tag: int = 0, **kwargs) -> Request: - ndsl_log.debug("Irecv on rank %s with source %s", self._comm.Get_rank(), source) return self._comm.Irecv(recvbuf, source, tag=tag, **kwargs) def Split(self, color, key) -> "Comm": - ndsl_log.debug( - "Split on rank %s with color %s, key %s", self._comm.Get_rank(), color, key - ) return self._comm.Split(color, key) def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T: - ndsl_log.debug( - "allreduce on rank %s with operator %s", self._comm.Get_rank(), op - ) return self._comm.allreduce(sendobj, self._op_mapping[op]) def Allreduce(self, sendobj_or_inplace: T, recvobj: T, op: ReductionOperator) -> T: - ndsl_log.debug( - "Allreduce on rank %s with operator %s", self._comm.Get_rank(), op - ) return self._comm.Allreduce(sendobj_or_inplace, recvobj, self._op_mapping[op]) def Allreduce_inplace(self, recvobj: T, op: ReductionOperator) -> T: - ndsl_log.debug( - "Allreduce (in place) on rank %s with operator %s", - self._comm.Get_rank(), - op, - ) return self._comm.Allreduce(mpi4py.MPI.IN_PLACE, recvobj, self._op_mapping[op]) diff --git a/ndsl/constants.py b/ndsl/constants.py index b7e91b18..eb89bd17 100644 --- a/ndsl/constants.py +++ b/ndsl/constants.py @@ -78,86 +78,133 @@ class ConstantVersions(Enum): # Physical constants ##################### if CONST_VERSION == ConstantVersions.GEOS: - RADIUS = Float(6.371e6) # Radius of the Earth [m] + RADIUS = Float(6.371e6) + """Radius of the Earth [m]""" PI_8 = np.float64(3.14159265358979323846) PI = Float(PI_8) - OMEGA = Float(2.0) * PI / Float(86164.0) # Rotation of the earth - GRAV = Float(9.80665) # Acceleration due to gravity [m/s^2].04 - RGRAV = Float(1.0) / GRAV # Inverse of gravitational acceleration - RDGAS = Float(8314.47) / Float( - 28.965 - ) # Gas constant for dry air [J/kg/deg] ~287.04 - RVGAS = Float(8314.47) / Float(18.015) # Gas constant for water vapor [J/kg/deg] - HLV = Float(2.4665e6) # Latent heat of evaporation [J/kg] - HLF = Float(3.3370e5) # Latent heat of fusion [J/kg] ~3.34e5 - KAPPA = RDGAS / (Float(3.5) * RDGAS) # Specific heat capacity of dry air at + OMEGA = Float(2.0) * PI / Float(86164.0) + """Rotation of the earth""" + GRAV = Float(9.80665) + """Acceleration due to gravity [m/s^2].04""" + RGRAV = Float(1.0) / GRAV + """Inverse of gravitational acceleration""" + RDGAS = Float(8314.47) / Float(28.965) + """Gas constant for dry air [J/kg/deg] ~287.04""" + RVGAS = Float(8314.47) / Float(18.015) + """Gas constant for water vapor [J/kg/deg]""" + HLV = Float(2.4665e6) + """Latent heat of evaporation 
[J/kg]""" + HLF = Float(3.3370e5) + """Latent heat of fusion [J/kg] ~3.34e5""" + KAPPA = RDGAS / (Float(3.5) * RDGAS) + """Specific heat capacity of dry air at""" CP_AIR = RDGAS / KAPPA - TFREEZE = Float(273.16) # Freezing temperature of fresh water [K] + TFREEZE = Float(273.16) + """Freezing temperature of fresh water [K]""" SAT_ADJUST_THRESHOLD = Float(1.0e-6) + DZ_MIN = Float(6.0) elif CONST_VERSION == ConstantVersions.UFS: - RADIUS = Float(6.3712e6) # Radius of the Earth [m] + RADIUS = Float(6.3712e6) + """Radius of the Earth [m]""" PI = Float(3.1415926535897931) - OMEGA = Float(7.2921e-5) # Rotation of the earth - GRAV = Float(9.80665) # Acceleration due to gravity [m/s^2].04 - RGRAV = Float(1.0 / GRAV) # Inverse of gravitational acceleration - RDGAS = Float(287.05) # Gas constant for dry air [J/kg/deg] ~287.04 - RVGAS = Float(461.50) # Gas constant for water vapor [J/kg/deg] - HLV = Float(2.5e6) # Latent heat of evaporation [J/kg] - HLF = Float(3.3358e5) # Latent heat of fusion [J/kg] ~3.34e5 + OMEGA = Float(7.2921e-5) + """Rotation of the earth""" + GRAV = Float(9.80665) + """Acceleration due to gravity [m/s^2].04""" + RGRAV = Float(1.0 / GRAV) + """Inverse of gravitational acceleration""" + RDGAS = Float(287.05) + """Gas constant for dry air [J/kg/deg] ~287.04""" + RVGAS = Float(461.50) + """Gas constant for water vapor [J/kg/deg]""" + HLV = Float(2.5e6) + """Latent heat of evaporation [J/kg]""" + HLF = Float(3.3358e5) + """Latent heat of fusion [J/kg] ~3.34e5""" CP_AIR = Float(1004.6) - KAPPA = RDGAS / CP_AIR # Specific heat capacity of dry air at - TFREEZE = Float(273.15) # Freezing temperature of fresh water [K] + KAPPA = RDGAS / CP_AIR + """Specific heat capacity of dry air at""" + TFREEZE = Float(273.15) + """Freezing temperature of fresh water [K]""" SAT_ADJUST_THRESHOLD = Float(1.0e-8) + DZ_MIN = Float(2.0) elif CONST_VERSION == ConstantVersions.GFDL: - RADIUS = Float(6371.0e3) # Radius of the Earth [m] #6371.0e3 - PI = Float(3.14159265358979323846) # 3.14159265358979323846 - OMEGA = Float(7.292e-5) # Rotation of the earth # 7.292e-5 - GRAV = Float(9.80) # Acceleration due to gravity [m/s^2].04 - RGRAV = Float(1.0) / GRAV # Inverse of gravitational acceleration - RDGAS = Float(287.04) # Gas constant for dry air [J/kg/deg] ~287.04 - RVGAS = Float(461.50) # Gas constant for water vapor [J/kg/deg] - HLV = Float(2.500e6) # Latent heat of evaporation [J/kg] - HLF = Float(3.34e5) # Latent heat of fusion [J/kg] ~3.34e5 + RADIUS = Float(6371.0e3) + """Radius of the Earth [m] #6371.0e3""" + PI = Float(3.14159265358979323846) + """3.14159265358979323846""" + OMEGA = Float(7.292e-5) + """Rotation of the earth # 7.292e-5""" + GRAV = Float(9.80) + """Acceleration due to gravity [m/s^2].04""" + RGRAV = Float(1.0) / GRAV + """Inverse of gravitational acceleration""" + RDGAS = Float(287.04) + """Gas constant for dry air [J/kg/deg] ~287.04""" + RVGAS = Float(461.50) + """Gas constant for water vapor [J/kg/deg]""" + HLV = Float(2.500e6) + """Latent heat of evaporation [J/kg]""" + HLF = Float(3.34e5) + """Latent heat of fusion [J/kg] ~3.34e5""" KAPPA = Float(2.0) / Float(7.0) - CP_AIR = RDGAS / KAPPA # Specific heat capacity of dry air at - TFREEZE = Float(273.16) # Freezing temperature of fresh water [K] + CP_AIR = RDGAS / KAPPA + """Specific heat capacity of dry air at""" + TFREEZE = Float(273.16) + """Freezing temperature of fresh water [K]""" SAT_ADJUST_THRESHOLD = Float(1.0e-8) + DZ_MIN = Float(2.0) else: raise RuntimeError("Constant selector failed, bad code.") SECONDS_PER_DAY = 
Float(86400.0) -DZ_MIN = Float(2.0) -CV_AIR = CP_AIR - RDGAS # Heat capacity of dry air at constant volume +SBC = 5.670400e-8 +"""Stefan-Boltzmann constant (W/m^2/K^4)""" +CV_AIR = CP_AIR - RDGAS +"""Heat capacity of dry air at constant volume""" RDG = -RDGAS / GRAV -CNST_0P20 = Float(0.2) K1K = RDGAS / CV_AIR -CNST_0P20 = Float(0.2) -CV_VAP = Float(3.0) * RVGAS # Heat capacity of water vapor at constant volume -ZVIR = RVGAS / RDGAS - Float(1) # con_fvirt in Fortran physics -C_ICE = Float(1972.0) # Heat capacity of ice at -15 degrees Celsius -C_ICE_0 = Float(2106.0) # Heat capacity of ice at 0 degrees Celsius -C_LIQ = Float(4.1855e3) # Heat capacity of water at 15 degrees Celsius -CP_VAP = Float(4.0) * RVGAS # Heat capacity of water vapor at constant pressure -TICE = Float(273.16) # Freezing temperature -DC_ICE = C_LIQ - C_ICE # Isobaric heating / cooling -DC_VAP = CP_VAP - C_LIQ # Isobaric heating / cooling -D2ICE = DC_VAP + DC_ICE # Isobaric heating / cooling +CNST_0P20 = np.float64(0.2) +CV_VAP = Float(3.0) * RVGAS +"""Heat capacity of water vapor at constant volume""" +ZVIR = RVGAS / RDGAS - Float(1) +"""con_fvirt in Fortran physics""" +C_ICE = Float(1972.0) +"""Heat capacity of ice at -15 degrees Celsius""" +C_ICE_0 = Float(2106.0) +"""Heat capacity of ice at 0 degrees Celsius""" +C_LIQ = Float(4.1855e3) +"""Heat capacity of water at 15 degrees Celsius""" +CP_VAP = Float(4.0) * RVGAS +"""Heat capacity of water vapor at constant pressure""" +TICE = Float(273.16) +"""Freezing temperature""" +DC_ICE = C_LIQ - C_ICE +"""Isobaric heating / cooling""" +DC_VAP = CP_VAP - C_LIQ +"""Isobaric heating / cooling""" +D2ICE = DC_VAP + DC_ICE +"""Isobaric heating / cooling""" LI0 = HLF - DC_ICE * TICE EPS = RDGAS / RVGAS EPSM1 = EPS - Float(1.0) -LV0 = ( - HLV - DC_VAP * TICE -) # 3.13905782e6, evaporation latent heat coefficient at 0 degrees Kelvin -LI00 = ( - HLF - DC_ICE * TICE -) # -2.7105966e5, fusion latent heat coefficient at 0 degrees Kelvin -LI2 = ( - LV0 + LI00 -) # 2.86799816e6, sublimation latent heat coefficient at 0 degrees Kelvin -E00 = Float(611.21) # Saturation vapor pressure at 0 degrees Celsius (Pa) -PSAT = Float(610.78) # Saturation vapor pressure at H2O 3pt (Pa) -T_WFR = TICE - Float(40.0) # homogeneous freezing temperature +LV0 = HLV - DC_VAP * TICE +"""3.13905782e6, evaporation latent heat coefficient at 0 degrees Kelvin""" +LI00 = HLF - DC_ICE * TICE +"""-2.7105966e5, fusion latent heat coefficient at 0 degrees Kelvin""" +LI2 = LV0 + LI00 +"""2.86799816e6, sublimation latent heat coefficient at 0 degrees Kelvin""" +E00 = Float(611.21) +"""Saturation vapor pressure at 0 degrees Celsius (Pa)""" +PSAT = Float(610.78) +"""Saturation vapor pressure at H2O 3pt (Pa)""" +T_WFR = TICE - Float(40.0) +"""homogeneous freezing temperature""" TICE0 = TICE - Float(0.01) -T_MIN = Float(178.0) # Minimum temperature to freeze-dry all water vapor +T_MIN = Float(178.0) +"""Minimum temperature to freeze-dry all water vapor""" T_SAT_MIN = TICE - Float(160.0) -LAT2 = np.power((HLV + HLF), 2, dtype=Float) # used in bigg mechanism +LAT2 = np.power((HLV + HLF), 2, dtype=Float) +"""Used in bigg mechanism""" +TTP = 2.7316e2 +"""Temperature of H2O triple point""" diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py index 5129dac8..27f17375 100644 --- a/ndsl/dsl/dace/dace_config.py +++ b/ndsl/dsl/dace/dace_config.py @@ -11,7 +11,7 @@ from ndsl.dsl.caches.cache_location import identify_code_path from ndsl.dsl.caches.codepath import FV3CodePath from ndsl.dsl.gt4py_utils import 
is_gpu_backend -from ndsl.dsl.typing import floating_point_precision +from ndsl.dsl.typing import get_precision from ndsl.optional_imports import cupy as cp @@ -264,7 +264,7 @@ def __init__( "compiler", "cuda", "syncdebug", value=dace_debug_env_var ) - if floating_point_precision() == 32: + if get_precision() == 32: # When using 32-bit float, we flip the default dtypes to be all # C, e.g. 32 bit. dace.Config.set( diff --git a/ndsl/dsl/dace/orchestration.py b/ndsl/dsl/dace/orchestration.py index 767610c3..c09b69cc 100644 --- a/ndsl/dsl/dace/orchestration.py +++ b/ndsl/dsl/dace/orchestration.py @@ -438,7 +438,7 @@ def __get__(self, obj, objtype=None) -> SDFGEnabledCallable: def orchestrate( *, obj: object, - config: DaceConfig, + config: Optional[DaceConfig], method_to_orchestrate: str = "__call__", dace_compiletime_args: Optional[Sequence[str]] = None, ): @@ -455,6 +455,9 @@ def orchestrate( dace_compiletime_args: list of names of arguments to be flagged has dace.compiletime for orchestration to behave """ + if config is None: + raise ValueError("DaCe config cannot be None") + if dace_compiletime_args is None: dace_compiletime_args = [] diff --git a/ndsl/dsl/gt4py_utils.py b/ndsl/dsl/gt4py_utils.py index 31c60ca7..6c0254e9 100644 --- a/ndsl/dsl/gt4py_utils.py +++ b/ndsl/dsl/gt4py_utils.py @@ -140,9 +140,7 @@ def make_storage_data( default_mask = (True, True, False) shape = (1, shape[axis]) else: - default_mask = tuple( - [i == axis for i in range(max_dim)] - ) # type: ignore + default_mask = tuple([i == axis for i in range(max_dim)]) # type: ignore elif dummy or axis != 2: default_mask = (True, True, True) else: @@ -151,16 +149,44 @@ def make_storage_data( if n_dims == 1: data = _make_storage_data_1d( - data, shape, start, dummy, axis, read_only, backend=backend + data, + shape, + start, + dummy, + axis, + read_only, + dtype=dtype, + backend=backend, ) elif n_dims == 2: data = _make_storage_data_2d( - data, shape, start, dummy, axis, read_only, backend=backend + data, + shape, + start, + dummy, + axis, + read_only, + dtype=dtype, + backend=backend, + ) + elif n_dims >= 4: + data = _make_storage_data_Nd( + data, + shape, + start, + dtype=dtype, + backend=backend, ) elif n_dims >= 4: data = _make_storage_data_Nd(data, shape, start, backend=backend) else: - data = _make_storage_data_3d(data, shape, start, backend=backend) + data = _make_storage_data_3d( + data, + shape, + start, + dtype=dtype, + backend=backend, + ) storage = gt4py.storage.from_array( data, @@ -180,11 +206,12 @@ def _make_storage_data_1d( axis: int = 2, read_only: bool = True, *, + dtype: DTypes = Float, backend: str, ) -> Field: # axis refers to a repeated axis, dummy refers to a singleton axis axis = min(axis, len(shape) - 1) - buffer = zeros(shape[axis], backend=backend) + buffer = zeros(shape[axis], dtype=dtype, backend=backend) if dummy: axis = list(set((0, 1, 2)).difference(dummy))[0] @@ -216,6 +243,7 @@ def _make_storage_data_2d( axis: int = 2, read_only: bool = True, *, + dtype: DTypes = Float, backend: str, ) -> Field: # axis refers to which axis should be repeated (when making a full 3d data), @@ -229,7 +257,7 @@ def _make_storage_data_2d( start1, start2 = start[0:2] size1, size2 = data.shape - buffer = zeros(shape2d, backend=backend) + buffer = zeros(shape2d, dtype=dtype, backend=backend) buffer[start1 : start1 + size1, start2 : start2 + size2] = asarray( data, type(buffer) ) @@ -249,11 +277,12 @@ def _make_storage_data_3d( shape: Tuple[int, ...], start: Tuple[int, ...] 
@@ -249,11 +277,12 @@ def _make_storage_data_3d(
     shape: Tuple[int, ...],
     start: Tuple[int, ...] = (0, 0, 0),
     *,
+    dtype: DTypes = Float,
     backend: str,
 ) -> Field:
     istart, jstart, kstart = start
     isize, jsize, ksize = data.shape
-    buffer = zeros(shape, backend=backend)
+    buffer = zeros(shape, dtype=dtype, backend=backend)
     buffer[
         istart : istart + isize,
         jstart : jstart + jsize,
@@ -267,11 +296,12 @@ def _make_storage_data_Nd(
     shape: Tuple[int, ...],
     start: Tuple[int, ...] = None,
     *,
+    dtype: DTypes = Float,
     backend: str,
 ) -> Field:
     if start is None:
         start = tuple([0] * data.ndim)
-    buffer = zeros(shape, backend=backend)
+    buffer = zeros(shape, dtype=dtype, backend=backend)
     idx = tuple([slice(start[i], start[i] + data.shape[i]) for i in range(len(start))])
     buffer[idx] = asarray(data, type(buffer))
     return buffer
diff --git a/ndsl/dsl/stencil.py b/ndsl/dsl/stencil.py
index 5e917e66..daf78091 100644
--- a/ndsl/dsl/stencil.py
+++ b/ndsl/dsl/stencil.py
@@ -31,6 +31,7 @@
 from ndsl.dsl.stencil_config import CompilationConfig, RunMode, StencilConfig
 from ndsl.dsl.typing import Float, Index3D, cast_to_index3d
 from ndsl.initialization.sizer import GridSizer, SubtileGridSizer
+from ndsl.logging import ndsl_log
 from ndsl.quantity import Quantity
 from ndsl.testing.comparison import LegacyMetric
@@ -374,6 +375,8 @@ def nothing_function(*args, **kwargs):
         setattr(self, "__call__", nothing_function)
 
     def __call__(self, *args, **kwargs) -> None:
+        if self.stencil_config.verbose:
+            ndsl_log.debug(f"Running {self._func_name}")
         args_list = list(args)
         _convert_quantities_to_storage(args_list, kwargs)
         args = tuple(args_list)
diff --git a/ndsl/dsl/stencil_config.py b/ndsl/dsl/stencil_config.py
index 6b8f75eb..4d3eafab 100644
--- a/ndsl/dsl/stencil_config.py
+++ b/ndsl/dsl/stencil_config.py
@@ -169,6 +169,7 @@ class StencilConfig(Hashable):
     compare_to_numpy: bool = False
     compilation_config: CompilationConfig = CompilationConfig()
     dace_config: Optional[DaceConfig] = None
+    verbose: bool = False
 
     def __post_init__(self):
         self.backend_opts = {
diff --git a/ndsl/dsl/typing.py b/ndsl/dsl/typing.py
index b3fa72d8..1cae1063 100644
--- a/ndsl/dsl/typing.py
+++ b/ndsl/dsl/typing.py
@@ -1,5 +1,5 @@
 import os
-from typing import Tuple, Union, cast
+from typing import Tuple, TypeAlias, Union, cast
 
 import gt4py.cartesian.gtscript as gtscript
 import numpy as np
@@ -22,35 +22,41 @@
 DTypes = Union[bool, np.bool_, int, np.int32, np.int64, float, np.float32, np.float64]
 
 
+# Deprecated alias of get_precision, retained for a PACE dependency
 def floating_point_precision() -> int:
     return int(os.getenv("PACE_FLOAT_PRECISION", "64"))
 
 
+def get_precision() -> int:
+    return int(os.getenv("PACE_FLOAT_PRECISION", "64"))
+
+
 # We redefine the type as a way to distinguish
 # the model definition of a float to other usage of the
 # common numpy type in the rest of the code.
-NDSL_32BIT_FLOAT_TYPE = np.float32
-NDSL_64BIT_FLOAT_TYPE = np.float64
+NDSL_32BIT_FLOAT_TYPE: TypeAlias = np.float32
+NDSL_64BIT_FLOAT_TYPE: TypeAlias = np.float64
+NDSL_32BIT_INT_TYPE: TypeAlias = np.int32
+NDSL_64BIT_INT_TYPE: TypeAlias = np.int64
 
 
-def global_set_floating_point_precision():
-    """Set the global floating point precision for all reference
-    to Float in the codebase. Defaults to 64 bit."""
-    global Float
-    precision_in_bit = floating_point_precision()
+def global_set_precision() -> Tuple[TypeAlias, TypeAlias]:
+    """Set the global precision for all references of
+    Float and Int in the codebase. Defaults to 64 bit."""
+    global Float, Int
+    precision_in_bit = get_precision()
     if precision_in_bit == 64:
-        return NDSL_64BIT_FLOAT_TYPE
+        return NDSL_64BIT_FLOAT_TYPE, NDSL_64BIT_INT_TYPE
     elif precision_in_bit == 32:
-        return NDSL_32BIT_FLOAT_TYPE
+        return NDSL_32BIT_FLOAT_TYPE, NDSL_32BIT_INT_TYPE
     else:
-        NotImplementedError(
+        raise NotImplementedError(
             f"{precision_in_bit} bit precision not implemented or tested"
         )
 
 # Default float and int types
-Float = global_set_floating_point_precision()
-Int = np.int_
+Float, Int = global_set_precision()
 Bool = np.bool_
 
 FloatField = Field[gtscript.IJK, Float]
@@ -68,10 +74,27 @@
 FloatFieldK = Field[gtscript.K, Float]
 FloatFieldK64 = Field[gtscript.K, np.float64]
 FloatFieldK32 = Field[gtscript.K, np.float32]
+
 IntField = Field[gtscript.IJK, Int]
+IntField64 = Field[gtscript.IJK, np.int64]
+IntField32 = Field[gtscript.IJK, np.int32]
+IntFieldI = Field[gtscript.I, Int]
+IntFieldI64 = Field[gtscript.I, np.int64]
+IntFieldI32 = Field[gtscript.I, np.int32]
+IntFieldJ = Field[gtscript.J, Int]
+IntFieldJ64 = Field[gtscript.J, np.int64]
+IntFieldJ32 = Field[gtscript.J, np.int32]
 IntFieldIJ = Field[gtscript.IJ, Int]
+IntFieldIJ64 = Field[gtscript.IJ, np.int64]
+IntFieldIJ32 = Field[gtscript.IJ, np.int32]
 IntFieldK = Field[gtscript.K, Int]
+IntFieldK64 = Field[gtscript.K, np.int64]
+IntFieldK32 = Field[gtscript.K, np.int32]
+
 BoolField = Field[gtscript.IJK, Bool]
+BoolFieldI = Field[gtscript.I, Bool]
+BoolFieldJ = Field[gtscript.J, Bool]
+BoolFieldK = Field[gtscript.K, Bool]
 BoolFieldIJ = Field[gtscript.IJ, Bool]
 
 Index3D = Tuple[int, int, int]
diff --git a/ndsl/grid/generation.py b/ndsl/grid/generation.py
index f77e2cd2..c32ceb3f 100644
--- a/ndsl/grid/generation.py
+++ b/ndsl/grid/generation.py
@@ -298,7 +298,7 @@ def __init__(
         self._dy_center = None
         self._area = None
         self._area_c = None
-        if eta_file is not None:
+        if eta_file is not None or ak is not None or bk is not None:
             (
                 self._ks,
                 self._ptop,
@@ -3331,12 +3331,12 @@
             self._np,
         )
 
-        edge_w = quantity_cast_to_model_float(self.quantity_factory, edge_w_64)
-        edge_e = quantity_cast_to_model_float(self.quantity_factory, edge_e_64)
-        edge_s = quantity_cast_to_model_float(self.quantity_factory, edge_s_64)
-        edge_n = quantity_cast_to_model_float(self.quantity_factory, edge_n_64)
-
-        return edge_w, edge_e, edge_s, edge_n
+        return (
+            edge_w_64,
+            edge_e_64,
+            edge_s_64,
+            edge_n_64,
+        )
 
     def _calculate_edge_a2c_vect_factors(self):
         edge_vect_s_64 = self.quantity_factory.zeros(
diff --git a/ndsl/grid/helper.py b/ndsl/grid/helper.py
index 6cb6a374..1a82d053 100644
--- a/ndsl/grid/helper.py
+++ b/ndsl/grid/helper.py
@@ -332,13 +332,21 @@ def __init__(
         vertical_data: VerticalGridData,
         contravariant_data: ContravariantGridData,
         angle_data: AngleGridData,
+        fc=None,
+        fc_agrid=None,
     ):
         self._horizontal_data = horizontal_data
         self._vertical_data = vertical_data
         self._contravariant_data = contravariant_data
         self._angle_data = angle_data
-        self._fC = None
-        self._fC_agrid = None
+        if fc is not None:
+            self._fC = GridData._fC_from_data(fc, horizontal_data.lat)
+        else:
+            self._fC = None
+        if fc_agrid is not None:
+            self._fC_agrid = GridData._fC_from_data(fc_agrid, horizontal_data.lat)
+        else:
+            self._fC_agrid = None
 
     @classmethod
     def new_from_metric_terms(cls, metric_terms: MetricTerms):
@@ -369,9 +377,7 @@
     def lat_agrid(self) -> Quantity:
         return self._horizontal_data.lat_agrid
 
     @staticmethod
-    def _fC_from_lat(lat: Quantity) -> Quantity:
-        np = lat.np
-        data = 2.0 * constants.OMEGA * np.sin(lat.data)
+    def _fC_from_data(data, lat: Quantity) -> Quantity:
         return Quantity(
             data,
             units="1/s",
@@ -381,6 +387,12 @@
             gt4py_backend=lat.gt4py_backend,
         )
 
+    @staticmethod
+    def _fC_from_lat(lat: Quantity) -> Quantity:
+        np = lat.np
+        data = Float(2.0) * constants.OMEGA * np.sin(lat.data, dtype=Float)
+        return GridData._fC_from_data(data, lat)
+
     @property
     def fC(self):
         """Coriolis parameter at cell corners"""
diff --git a/ndsl/initialization/sizer.py b/ndsl/initialization/sizer.py
index d42acf77..8ad3d196 100644
--- a/ndsl/initialization/sizer.py
+++ b/ndsl/initialization/sizer.py
@@ -66,6 +66,17 @@ def from_tile_params(
         )
         nx = x_slice.stop - x_slice.start
         ny = y_slice.stop - y_slice.start
+
+        # TODO: Remove after vector halo update issue resolved
+        if nx <= n_halo:
+            raise Exception(
+                "SubtileGridSizer::from_tile_params: x compute domain extent must be greater than halo size"
+            )
+        if ny <= n_halo:
+            raise Exception(
+                "SubtileGridSizer::from_tile_params: y compute domain extent must be greater than halo size"
+            )
+
         return cls(nx, ny, nz, n_halo, extra_dim_lengths)
 
     @classmethod
diff --git a/ndsl/logging.py b/ndsl/logging.py
index 44cdb690..73b7979c 100644
--- a/ndsl/logging.py
+++ b/ndsl/logging.py
@@ -1,6 +1,9 @@
+from __future__ import annotations
+
 import logging
 import os
 import sys
+from typing import Annotated
 
 from mpi4py import MPI
@@ -18,7 +21,7 @@
 }
 
-def _ndsl_logger():
+def _ndsl_logger() -> logging.Logger:
     name_log = logging.getLogger(__name__)
     name_log.setLevel(LOGLEVEL)
@@ -36,4 +39,33 @@
     return name_log
 
-ndsl_log = _ndsl_logger()
+def _ndsl_logger_on_rank_0() -> logging.Logger:
+    name_log = logging.getLogger(f"{__name__}_on_rank_0")
+    name_log.setLevel(LOGLEVEL)
+
+    rank = MPI.COMM_WORLD.Get_rank()
+
+    if rank == 0:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setLevel(LOGLEVEL)
+        formatter = logging.Formatter(
+            fmt=(
+                f"%(asctime)s|%(levelname)s|rank {MPI.COMM_WORLD.Get_rank()}|"
+                "%(name)s:%(message)s"
+            ),
+            datefmt="%Y-%m-%d %H:%M:%S",
+        )
+        handler.setFormatter(formatter)
+        name_log.addHandler(handler)
+    else:
+        name_log.disabled = True
+    return name_log
+
+
+ndsl_log: Annotated[
+    logging.Logger, "NDSL Python logger, logs on all ranks"
+] = _ndsl_logger()
+
+ndsl_log_on_rank_0: Annotated[
+    logging.Logger, "NDSL Python logger, logs on rank 0 only"
+] = _ndsl_logger_on_rank_0()
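
# --- Editorial sketch (not part of the patch): the rank-gated logger pattern
# used by _ndsl_logger_on_rank_0 above, reduced to its core. Run under MPI
# (e.g. mpirun -n 6) to see a single line instead of one line per rank.
import logging
import sys

from mpi4py import MPI

def rank0_logger() -> logging.Logger:
    log = logging.getLogger("demo_on_rank_0")
    if MPI.COMM_WORLD.Get_rank() == 0:
        log.addHandler(logging.StreamHandler(sys.stdout))  # only rank 0 emits
    else:
        log.disabled = True  # all other ranks drop records entirely
    return log

rank0_logger().warning("printed once, not once per rank")
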
diff --git a/ndsl/monitor/netcdf_monitor.py b/ndsl/monitor/netcdf_monitor.py
index 8a0b96fd..204d7b94 100644
--- a/ndsl/monitor/netcdf_monitor.py
+++ b/ndsl/monitor/netcdf_monitor.py
@@ -1,11 +1,13 @@
 import os
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set
+from warnings import warn
 
 import fsspec
 import numpy as np
 
 from ndsl.comm.communicator import Communicator
+from ndsl.dsl.typing import Float, get_precision
 from ndsl.filesystem import get_fs
 from ndsl.logging import ndsl_log
 from ndsl.monitor.convert import to_numpy
@@ -114,6 +116,7 @@
         path: str,
         communicator: Communicator,
         time_chunk_size: int = 1,
+        precision=Float,
     ):
         """Create a NetCDFMonitor.
@@ -130,6 +133,11 @@
         self._time_chunk_size = time_chunk_size
         self.__writer: Optional[_ChunkedNetCDFWriter] = None
         self._expected_vars: Optional[Set[str]] = None
+        self._transfer_type = precision
+        if self._transfer_type == np.float32 and get_precision() > 32:
+            warn(
+                f"NetCDF save: requested 32-bit float but precision of NDSL is {get_precision()}, cast will occur with possible loss of precision"
+            )
 
     @property
     def _writer(self):
@@ -164,12 +172,16 @@ def store(self, state: dict) -> None:
                 set(state.keys()), self._expected_vars
             )
         )
-        state = self._communicator.tile.gather_state(state, transfer_type=np.float32)
+        state = self._communicator.tile.gather_state(
+            state, transfer_type=self._transfer_type
+        )
         if state is not None:  # we are on root rank
             self._writer.append(state)
 
     def store_constant(self, state: Dict[str, Quantity]) -> None:
-        state = self._communicator.gather_state(state, transfer_type=np.float32)
+        state = self._communicator.gather_state(
+            state, transfer_type=self._transfer_type
+        )
         if state is not None:  # we are on root rank
             constants_filename = str(
                 Path(self._path) / NetCDFMonitor._CONSTANT_FILENAME
diff --git a/ndsl/namelist.py b/ndsl/namelist.py
index 304d9160..8df5c207 100644
--- a/ndsl/namelist.py
+++ b/ndsl/namelist.py
@@ -490,6 +490,8 @@ class Namelist:
     nf_omega: int = NamelistDefaults.nf_omega
     fv_sg_adj: int = NamelistDefaults.fv_sg_adj
    n_sponge: int = NamelistDefaults.n_sponge
+    daily_mean: bool = False
+    """Flag to replace cosz with its daily mean value in the physics"""
 
     @classmethod
     def from_f90nml(cls, namelist: f90nml.Namelist):
diff --git a/ndsl/quantity/quantity.py b/ndsl/quantity/quantity.py
index c88ba140..4f80fff1 100644
--- a/ndsl/quantity/quantity.py
+++ b/ndsl/quantity/quantity.py
@@ -3,6 +3,7 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+from mpi4py import MPI
 
 import ndsl.constants as constants
 from ndsl.dsl.typing import Float, is_float
@@ -152,6 +153,11 @@
             gt4py_backend=gt4py_backend,
         )
 
+    def to_netcdf(self, path: str, name="var", rank: int = -1) -> None:
+        this_rank = MPI.COMM_WORLD.Get_rank()
+        if rank < 0 or this_rank == rank:
+            self.data_array.to_dataset(name=name).to_netcdf(f"{path}__r{this_rank}.nc4")
+
     def halo_spec(self, n_halo: int) -> QuantityHaloSpec:
         return QuantityHaloSpec(
             n_halo,
@@ -255,8 +260,16 @@
     @property
-    def data_array(self) -> xr.DataArray:
-        return xr.DataArray(self.view[:], dims=self.dims, attrs=self.attrs)
+    def data_array(self, full_data=False) -> xr.DataArray:
+        """Returns an xarray.DataArray of the view (domain)
+
+        Args:
+            full_data: Return the entire data (halo included) instead of the view
+        """
+        if full_data:
+            return xr.DataArray(self.data[:], dims=self.dims, attrs=self.attrs)
+        else:
+            return xr.DataArray(self.view[:], dims=self.dims, attrs=self.attrs)
 
     @property
     def np(self) -> NumpyModule:
diff --git a/ndsl/stencils/testing/conftest.py b/ndsl/stencils/testing/conftest.py
index 2ed22fee..6da880fd 100644
--- a/ndsl/stencils/testing/conftest.py
+++ b/ndsl/stencils/testing/conftest.py
@@ -91,6 +91,18 @@ def pytest_addoption(parser):
         default=False,
         help="Use the multi-modal float metric. Default to False.",
     )
+    parser.addoption(
+        "--sort_report",
+        action="store",
+        default="ulp",
+        help='Sort the report by "index" (ascending) or along the metric: "ulp", "absolute", "relative" (descending). Default to "ulp"',
Default to "ulp"', + ) + parser.addoption( + "--no_report", + action="store_true", + default=False, + help="Do not generate logging report or NetCDF in .translate-errors", + ) def pytest_configure(config): @@ -237,6 +249,8 @@ def sequential_savepoint_cases(metafunc, data_path, namelist_filename, *, backen savepoint_to_replay = get_savepoint_restriction(metafunc) grid_mode = metafunc.config.getoption("grid") topology_mode = metafunc.config.getoption("topology") + sort_report = metafunc.config.getoption("sort_report") + no_report = metafunc.config.getoption("no_report") return _savepoint_cases( savepoint_names, ranks, @@ -247,6 +261,8 @@ def sequential_savepoint_cases(metafunc, data_path, namelist_filename, *, backen data_path, grid_mode, topology_mode, + sort_report=sort_report, + no_report=no_report, ) @@ -260,6 +276,8 @@ def _savepoint_cases( data_path: str, grid_mode: str, topology_mode: bool, + sort_report: str, + no_report: bool, ): return_list = [] for rank in ranks: @@ -309,10 +327,11 @@ def _savepoint_cases( SavepointCase( savepoint_name=test_name, data_dir=data_path, - rank=rank, i_call=i_call, testobj=testobj, grid=grid, + sort_report=sort_report, + no_report=no_report, ) ) return return_list @@ -333,6 +352,8 @@ def parallel_savepoint_cases( ): namelist = get_namelist(namelist_filename) topology_mode = metafunc.config.getoption("topology") + sort_report = metafunc.config.getoption("sort_report") + no_report = metafunc.config.getoption("no_report") communicator = get_communicator(comm, namelist.layout, topology_mode) stencil_config = get_config(backend, communicator) savepoint_names = get_parallel_savepoint_names(metafunc, data_path) @@ -348,6 +369,8 @@ def parallel_savepoint_cases( data_path, grid_mode, topology_mode, + sort_report=sort_report, + no_report=no_report, ) diff --git a/ndsl/stencils/testing/grid.py b/ndsl/stencils/testing/grid.py index fbee10a5..6d4b4b38 100644 --- a/ndsl/stencils/testing/grid.py +++ b/ndsl/stencils/testing/grid.py @@ -7,7 +7,6 @@ from ndsl.constants import N_HALO_DEFAULT, X_DIM, Y_DIM, Z_DIM from ndsl.dsl import gt4py_utils as utils from ndsl.dsl.stencil import GridIndexing -from ndsl.dsl.typing import Float from ndsl.grid.generation import GridDefinitions from ndsl.grid.helper import ( AngleGridData, @@ -505,7 +504,12 @@ def grid_data(self) -> "GridData": data = getattr(self, name) assert data is not None - quantity = self.quantity_factory.zeros(dims=dims, units=units, dtype=Float) + quantity = self.quantity_factory.zeros( + dims=dims, + units=units, + dtype=data.dtype, + allow_mismatch_float_precision=True, + ) if len(quantity.shape) == 3: quantity.data[:] = data[:, :, : quantity.shape[2]] elif len(quantity.shape) == 2: @@ -549,6 +553,7 @@ def grid_data(self) -> "GridData": data=self.area_64, dims=GridDefinitions.area.dims, units=GridDefinitions.area.units, + allow_mismatch_float_precision=True, ), rarea=self.quantity_factory.from_array( data=self.rarea, @@ -810,6 +815,8 @@ def grid_data(self) -> "GridData": vertical_data=vertical, contravariant_data=contravariant, angle_data=angle, + fc=self.fC, + fc_agrid=self.f0, ) return self._grid_data diff --git a/ndsl/stencils/testing/parallel_translate.py b/ndsl/stencils/testing/parallel_translate.py index e0669994..7df16a17 100644 --- a/ndsl/stencils/testing/parallel_translate.py +++ b/ndsl/stencils/testing/parallel_translate.py @@ -17,6 +17,9 @@ class ParallelTranslate: max_error = TranslateFortranData2Py.max_error near_zero = TranslateFortranData2Py.near_zero + mmr_absolute_eps = 
diff --git a/ndsl/stencils/testing/parallel_translate.py b/ndsl/stencils/testing/parallel_translate.py
index e0669994..7df16a17 100644
--- a/ndsl/stencils/testing/parallel_translate.py
+++ b/ndsl/stencils/testing/parallel_translate.py
@@ -17,6 +17,9 @@ class ParallelTranslate:
 
     max_error = TranslateFortranData2Py.max_error
     near_zero = TranslateFortranData2Py.near_zero
+    mmr_absolute_eps = TranslateFortranData2Py.mmr_absolute_eps
+    mmr_relative_fraction = TranslateFortranData2Py.mmr_relative_fraction
+    mmr_ulp = TranslateFortranData2Py.mmr_ulp
     compute_grid_option = False
     tests_grid = False
     inputs: Dict[str, Any] = {}
diff --git a/ndsl/stencils/testing/savepoint.py b/ndsl/stencils/testing/savepoint.py
index 7571befb..2708011e 100644
--- a/ndsl/stencils/testing/savepoint.py
+++ b/ndsl/stencils/testing/savepoint.py
@@ -16,11 +16,29 @@ def dataset_to_dict(ds: xr.Dataset) -> Dict[str, Union[np.ndarray, float, int]]:
 
 def _process_if_scalar(value: np.ndarray) -> Union[np.ndarray, float, int]:
     if len(value.shape) == 0:
-        return value.item()
+        return value.max()  # trick to make sure we get the right type back
     else:
         return value
 
 
+class DataLoader:
+    def __init__(self, rank: int, data_path: str):
+        self._data_path = data_path
+        self._rank = rank
+
+    def load(
+        self,
+        name: str,
+        postfix: str = "",
+        i_call: int = 0,
+    ) -> Dict[str, Union[np.ndarray, float, int]]:
+        return dataset_to_dict(
+            xr.open_dataset(os.path.join(self._data_path, f"{name}{postfix}.nc"))
+            .isel(rank=self._rank)
+            .isel(savepoint=i_call)
+        )
+
+
 class Translate(Protocol):
     def collect_input_data(self, ds: xr.Dataset) -> dict:
         ...
@@ -28,6 +46,9 @@
     def compute(self, data: dict):
         ...
 
+    def extra_data_load(self, data_loader: DataLoader):
+        ...
+
 
 @dataclasses.dataclass
 class SavepointCase:
@@ -37,13 +58,14 @@
 
     savepoint_name: str
     data_dir: str
-    rank: int
     i_call: int
     testobj: Translate
     grid: Grid
+    sort_report: str
+    no_report: bool
 
     def __str__(self):
-        return f"{self.savepoint_name}-rank={self.rank}-call={self.i_call}"
+        return f"{self.savepoint_name}-rank={self.grid.rank}-call={self.i_call}"
 
     @property
     def exists(self) -> bool:
@@ -51,14 +73,14 @@
         return (
             xr.open_dataset(
                 os.path.join(self.data_dir, f"{self.savepoint_name}-In.nc")
            ).sizes["rank"]
-            > self.rank
+            > self.grid.rank
         )
 
     @property
     def ds_in(self) -> xr.Dataset:
         return (
             xr.open_dataset(os.path.join(self.data_dir, f"{self.savepoint_name}-In.nc"))
-            .isel(rank=self.rank)
+            .isel(rank=self.grid.rank)
             .isel(savepoint=self.i_call)
         )
@@ -68,6 +90,6 @@
             xr.open_dataset(
                 os.path.join(self.data_dir, f"{self.savepoint_name}-Out.nc")
             )
-            .isel(rank=self.rank)
+            .isel(rank=self.grid.rank)
             .isel(savepoint=self.i_call)
         )
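
# --- Editorial sketch (not part of the patch): what DataLoader.load() above
# does, demonstrated on a synthetic savepoint file. The file and variable
# names are illustrative only.
import numpy as np
import xarray as xr

xr.Dataset({"qv": (("rank", "savepoint", "x"), np.zeros((2, 3, 4)))}).to_netcdf(
    "MySavepoint-In.nc"
)
# DataLoader(rank=0, data_path=".").load("MySavepoint", postfix="-In") reduces to:
ds = xr.open_dataset("MySavepoint-In.nc").isel(rank=0).isel(savepoint=0)
print(ds["qv"].shape)  # -> (4,): one rank, one savepoint call
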
If not, aggregate and
             # feedback on one rank
             collapse_all_ranks = False
diff --git a/ndsl/stencils/testing/test_translate.py b/ndsl/stencils/testing/test_translate.py
index 70480c16..5b9bc773 100644
--- a/ndsl/stencils/testing/test_translate.py
+++ b/ndsl/stencils/testing/test_translate.py
@@ -14,7 +14,7 @@
 from ndsl.dsl.stencil import CompilationConfig, StencilConfig
 from ndsl.quantity import Quantity
 from ndsl.restart._legacy_restart import RESTART_PROPERTIES
-from ndsl.stencils.testing.savepoint import SavepointCase, dataset_to_dict
+from ndsl.stencils.testing.savepoint import DataLoader, SavepointCase, dataset_to_dict
 from ndsl.testing.comparison import BaseMetric, LegacyMetric, MultiModalFloatMetric
 from ndsl.testing.perturbation import perturb
@@ -178,7 +178,7 @@ def test_sequential_savepoint(
     if case.testobj.skip_test:
         return
     if not case.exists:
-        pytest.skip(f"Data at rank {case.rank} does not exists")
+        pytest.skip(f"Data at rank {case.grid.rank} does not exist")
     input_data = dataset_to_dict(case.ds_in)
     input_names = (
         case.testobj.serialnames(case.testobj.in_vars["data_vars"])
@@ -191,6 +191,9 @@
             f"Variable {e} was described in the translate test but cannot be found in the NetCDF"
         )
     original_input_data = copy.deepcopy(input_data)
+    # give the user a chance to load data from other savepoints to allow
+    # for gathering required data from multiple sources (constants, etc.)
+    case.testobj.extra_data_load(DataLoader(case.grid.rank, case.data_dir))
     # run python version of functionality
     output = case.testobj.compute(input_data)
     failing_names: List[str] = []
@@ -205,7 +208,7 @@
         try:
             ref_data = all_ref_data[varname]
         except KeyError:
-            raise KeyError(f'Output "{varname}" couldn\'t be found in output data')
+            raise KeyError(f"Output {varname} couldn't be found in output data")
         if hasattr(case.testobj, "subset_output"):
             ref_data = case.testobj.subset_output(varname, ref_data)
         with subtests.test(varname=varname):
@@ -218,6 +221,7 @@
                     absolute_eps_override=case.testobj.mmr_absolute_eps,
                     relative_fraction_override=case.testobj.mmr_relative_fraction,
                     ulp_override=case.testobj.mmr_ulp,
+                    sort_report=case.sort_report,
                 )
             else:
                 metric = LegacyMetric(
@@ -234,8 +238,9 @@
             ref_data_out[varname] = [ref_data]
 
     # Reporting & data save
-    _report_results(case.savepoint_name, results)
-    if len(failing_names) > 0:
+    if not case.no_report:
+        _report_results(case.savepoint_name, case.grid.rank, results)
+    if len(failing_names) > 0 and not case.no_report:
         get_thresholds(case.testobj, input_data=original_input_data)
         os.makedirs(OUTDIR, exist_ok=True)
         nc_filename = os.path.join(OUTDIR, f"translate-{case.savepoint_name}.nc")
@@ -248,6 +253,7 @@
             [output],
             ref_data_out,
             failing_names,
+            passing_names,
             nc_filename,
         )
     if failing_names != []:
@@ -341,7 +347,7 @@
     if (grid == "compute") and not case.testobj.compute_grid_option:
         pytest.xfail(f"Grid compute option not used for test {case.savepoint_name}")
     if not case.exists:
-        pytest.skip(f"Data at rank {case.rank} does not exists")
+        pytest.skip(f"Data at rank {case.grid.rank} does not exist")
     input_data = dataset_to_dict(case.ds_in)
     # run python version of functionality
     output = case.testobj.compute_parallel(input_data, communicator)
@@ -368,9 +374,12 @@
                 metric = MultiModalFloatMetric(
                     reference_values=ref_data[varname][0],
                     computed_values=output_data,
-                    eps=case.testobj.max_error,
+                    absolute_eps_override=case.testobj.mmr_absolute_eps,
+                    relative_fraction_override=case.testobj.mmr_relative_fraction,
+                    ulp_override=case.testobj.mmr_ulp,
                     ignore_near_zero_errors=ignore_near_zero,
                     near_zero=case.testobj.near_zero,
+                    sort_report=case.sort_report,
                 )
             else:
                 metric = LegacyMetric(
@@ -386,7 +395,7 @@
         passing_names.append(failing_names.pop())
 
     # Reporting & data save
-    _report_results(case.savepoint_name, results)
+    _report_results(case.savepoint_name, case.grid.rank, results)
     if len(failing_names) > 0:
         os.makedirs(OUTDIR, exist_ok=True)
         nct_filename = os.path.join(
@@ -398,14 +407,15 @@
             input_data_on_host[key] = gt_utils.asarray(_input)
         save_netcdf(
             case.testobj,
-            [input_data_on_host],
-            [output],
-            ref_data,
-            failing_names,
-            nct_filename,
+            inputs_list=[input_data_on_host],
+            output_list=[output],
+            ref_data=ref_data,
+            failing_names=failing_names,
+            passing_names=passing_names,
+            out_filename=nct_filename,
         )
     except Exception as error:
-        print(f"TestParallel SaveNetCDF Error: {error}")
+        print(f"TestParallel SaveNetCDF Error at rank {case.grid.rank}: {error}")
     if failing_names != []:
         pytest.fail(
             f"Only the following variables passed: {passing_names}", pytrace=False
@@ -414,36 +424,47 @@
         pytest.fail("No tests passed")
 
 
-def _report_results(savepoint_name: str, results: Dict[str, BaseMetric]) -> None:
-    os.makedirs(OUTDIR, exist_ok=True)
+def _report_results(
+    savepoint_name: str,
+    rank: int,
+    results: Dict[str, BaseMetric],
+) -> None:
+    detail_dir = f"{OUTDIR}/details"
+    os.makedirs(detail_dir, exist_ok=True)
     # Summary
-    with open(f"{OUTDIR}/summary-{savepoint_name}.log", "w") as f:
+    with open(f"{OUTDIR}/summary-{savepoint_name}-{rank}.log", "w") as f:
         for varname, metric in results.items():
             f.write(f"{varname}: {metric.one_line_report()}\n")
     # Detailed log
     for varname, metric in results.items():
-        log_filename = os.path.join(OUTDIR, f"details-{savepoint_name}-{varname}.log")
+        log_filename = os.path.join(
+            detail_dir, f"{savepoint_name}-{varname}-{rank}.log"
+        )
         metric.report(log_filename)
 
 
-def save_netcdf(
+def _save_datatree(
     testobj,
     # first list over rank, second list over savepoint
     inputs_list: List[Dict[str, List[np.ndarray]]],
     output_list: List[Dict[str, List[np.ndarray]]],
     ref_data: Dict[str, List[np.ndarray]],
-    failing_names,
-    out_filename,
+    names: List[str],
 ):
     import xarray as xr
 
-    data_vars = {}
-    for i, varname in enumerate(failing_names):
+    datasets = {}
+    indices = np.argsort(names)
+    for index in indices:
+        data_vars = {}
+        varname = names[index]
         # Read in dimensions and attributes
-        if hasattr(testobj, "outputs"):
-            dims = [dim_name + f"_{i}" for dim_name in testobj.outputs[varname]["dims"]]
+        if hasattr(testobj, "outputs") and testobj.outputs != {}:
+            dims = [
+                dim_name + f"_{index}" for dim_name in testobj.outputs[varname]["dims"]
+            ]
             attrs = {"units": testobj.outputs[varname]["units"]}
         else:
             dims = [
@@ -477,6 +498,37 @@
         )
         data_vars[f"{varname}_absolute_error"] = absolute_errors
         data_vars[f"{varname}_absolute_error"].attrs = attrs
+        datasets[varname] = xr.Dataset(data_vars=data_vars)
+
+    return xr.DataTree.from_dict(datasets)
+
+
+def save_netcdf(
+    testobj,
+    # first list over rank, second list over savepoint
+    inputs_list: List[Dict[str, List[np.ndarray]]],
+    output_list: List[Dict[str, List[np.ndarray]]],
+    ref_data: Dict[str, List[np.ndarray]],
+    failing_names: List[str],
+    passing_names: List[str],
+    out_filename,
+):
+    import xarray as xr
+
+    datasets = {}
+    datasets["Fail"] = _save_datatree(
+        testobj=testobj,
+        inputs_list=inputs_list,
+        output_list=output_list,
+        ref_data=ref_data,
+        names=failing_names,
+    )
+    datasets["Pass"] = _save_datatree(
+        testobj=testobj,
+        inputs_list=inputs_list,
+        output_list=output_list,
+        ref_data=ref_data,
+        names=passing_names,
+    )
+    xr.DataTree.from_dict(datasets).to_netcdf(out_filename)
     print(f"File saved to {out_filename}")
-    xr.Dataset(data_vars=data_vars).to_netcdf(out_filename)
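
# --- Editorial sketch (not part of the patch): the Pass/Fail group layout
# save_netcdf() now writes, shown on synthetic data. xr.DataTree requires a
# recent xarray (hence the xarray>=2025.01.2 pin later in setup.py).
import numpy as np
import xarray as xr

fail = xr.Dataset({"qv_computed": ("x", np.zeros(3)), "qv_reference": ("x", np.ones(3))})
ok = xr.Dataset({"pt_computed": ("x", np.ones(3)), "pt_reference": ("x", np.ones(3))})
xr.DataTree.from_dict({"Fail": fail, "Pass": ok}).to_netcdf("translate-report.nc")
# The resulting NetCDF holds two groups, /Fail and /Pass, one dataset per bucket.
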
diff --git a/ndsl/stencils/testing/translate.py b/ndsl/stencils/testing/translate.py
index e3fc8845..8132d290 100644
--- a/ndsl/stencils/testing/translate.py
+++ b/ndsl/stencils/testing/translate.py
@@ -8,6 +8,7 @@
 from ndsl.dsl.typing import Field, Float, Int  # noqa: F401
 from ndsl.quantity import Quantity
 from ndsl.stencils.testing.grid import Grid  # type: ignore
+from ndsl.stencils.testing.savepoint import DataLoader
 
 try:
@@ -51,6 +52,12 @@
 class TranslateFortranData2Py:
+    """Translate test main class.
+
+    The translate test will test a set of inputs and outputs, after having processed
+    the inputs via the user-provided `compute_func`.
+    """
+
     max_error = 1e-14
     near_zero = 1e-18
     mmr_absolute_eps = -1
@@ -69,18 +76,36 @@ def __init__(self, grid, stencil_factory: StencilFactory, origin=utils.origin):
         self.ignore_near_zero_errors: Dict[str, Any] = {}
         self.skip_test: bool = False
 
-    def setup(self, inputs):
+    def extra_data_load(self, data_loader: DataLoader):
+        pass
+
+    def setup(self, inputs) -> None:
+        """Transform inputs to gt4py storages specification (correct device, layout)"""
         self.make_storage_data_input_vars(inputs)
 
-    def compute_func(self, **inputs):
+    def compute_func(self, **inputs) -> Optional[dict[str, Any]]:
+        """Compute function to transform the dictionary of `inputs`.
+        Must return a dictionary of updated variables"""
         raise NotImplementedError("Implement a child class compute method")
 
-    def compute(self, inputs):
+    def compute(self, inputs) -> dict[str, Any]:
+        """Transform inputs from NetCDF to gt4py storages, run compute_func, then slice
+        the outputs based on specifications.
+
+        Return: Dictionary of storages reshaped for comparison
+        """
         self.setup(inputs)
         return self.slice_output(self.compute_from_storage(inputs))
 
     # assume inputs already has been turned into gt4py storages (or Quantities)
-    def compute_from_storage(self, inputs):
+    def compute_from_storage(self, inputs) -> dict[str, Any]:
+        """Run `compute_func` and return an updated `inputs` dictionary with
+        the returned results of `compute_func`.
+
+        Assumption: `inputs` are `gt4py.storages`
+
+        Return: Outputs in the form of a Dict[str, gt4py.storages]
+        """
         outputs = self.compute_func(**inputs)
         if outputs is not None:
             inputs.update(outputs)
@@ -108,7 +133,13 @@ def make_storage_data(
         names_4d: Optional[List[str]] = None,
         read_only: bool = False,
         full_shape: bool = False,
-    ) -> Dict[str, "Field"]:
+    ) -> "Field":
+        """Copy input data into a gt4py.storage with given shape.
+
+        `array` is copied. Takes care of the device upload if necessary.
+
+        Return: A gt4py storage of the given shape
+        """
         use_shape = list(self.maxshape)
         if dummy_axes:
             for axis in dummy_axes:
@@ -167,18 +198,20 @@
         kstart = self.get_index_from_info(varinfo, "kstart", 0)
         return istart, jstart, kstart
 
-    def make_storage_data_input_vars(self, inputs, storage_vars=None, dict_4d=True):
+    def make_storage_data_input_vars(
+        self, inputs, storage_vars=None, dict_4d=True
+    ) -> None:
+        """From a set of raw inputs (straight from NetCDF), use the `in_vars` dictionary to update inputs to
+        their configured shape.
+
+        Return: None
+        """
         inputs_in = {**inputs}
         inputs_out = {}
         if storage_vars is None:
             storage_vars = self.storage_vars()
         for p in self.in_vars["parameters"]:
-            if type(inputs_in[p]) in [int, np.int64, np.int32]:
-                inputs_out[p] = int(inputs_in[p])
-            elif type(inputs_in[p]) is bool:
-                inputs_out[p] = inputs_in[p]
-            else:
-                inputs_out[p] = Float(inputs_in[p])
+            inputs_out[p] = inputs_in[p]
         for d, info in storage_vars.items():
             serialname = info["serialname"] if "serialname" in info else d
             self.update_info(info, inputs_in)
@@ -214,7 +247,7 @@
             inputs.clear()
         inputs.update(inputs_out)
 
-    def slice_output(self, inputs, out_data=None):
+    def slice_output(self, inputs, out_data=None) -> dict[str, Any]:
         utils.device_sync(backend=self.stencil_factory.backend)
         if out_data is None:
             out_data = inputs
@@ -351,6 +384,7 @@
             shape=buffer.shape,
             backend=self.backend,
             mask=mask,
+            dtype=self.data[varname].dtype,
         )
 
     def _make_composite_vvar_storage(self, varname, data3d, shape):
@@ -364,6 +398,7 @@
             shape=buffer.shape,
             origin=(1, 1, 0),
             backend=self.backend,
+            dtype=self.data[varname].dtype,
         )
 
     def make_grid_storage(self, pygrid):
@@ -385,6 +420,7 @@
             (shape[0], shape[1], 3),
             origin=(0, 0, 0),
             backend=self.backend,
+            dtype=self.data[key].dtype,
         )
         for key, axis in TranslateGrid.edge_var_axis.items():
             if key in self.data:
@@ -395,6 +431,7 @@
                 axis=axis,
                 read_only=True,
                 backend=self.backend,
+                dtype=self.data[key].dtype,
             )
         for key, axis in TranslateGrid.edge_vect_axis.items():
             if key in self.data:
@@ -418,6 +455,7 @@
                 start=origin,
                 read_only=True,
                 backend=self.backend,
+                dtype=value.dtype,
             )
 
     def python_grid(self):
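
# --- Editorial sketch (not part of the patch): the compute() pipeline the
# docstrings above describe, as a stripped-down stand-in. A real test would
# derive from TranslateFortranData2Py; this class is hypothetical.
import numpy as np

class TranslateDoubleIt:
    def compute(self, inputs: dict) -> dict:
        # setup() would turn NetCDF arrays into gt4py storages here
        outputs = self.compute_func(**inputs)  # user-provided transformation
        inputs.update(outputs)
        return inputs  # slice_output() would reshape for comparison here

    def compute_func(self, field, factor):
        return {"field": np.asarray(field) * factor}

print(TranslateDoubleIt().compute({"field": [1.0, 2.0], "factor": 2}))
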
diff --git a/ndsl/testing/comparison.py b/ndsl/testing/comparison.py
index 5bce02dd..7862904d 100644
--- a/ndsl/testing/comparison.py
+++ b/ndsl/testing/comparison.py
@@ -1,9 +1,25 @@
-from typing import List, Optional, Union
+from typing import Any, List, Optional, Union
 
 import numpy as np
 import numpy.typing as npt
 
 
+def _fixed_width_float_16e(value: np.floating[Any]) -> str:
+    """Pad non-negative values with a space to stay aligned with the '-' sign"""
+    if value >= 0:
+        return f" {value:.16e}"
+    else:
+        return f"{value:.16e}"
+
+
+def _fixed_width_float_2e(value: np.floating[Any]) -> str:
+    """Pad non-negative values with a space to stay aligned with the '-' sign"""
+    if value >= 0:
+        return f" {value:.2e}"
+    else:
+        return f"{value:.2e}"
+
+
 class BaseMetric:
     def __init__(
         self,
@@ -210,6 +226,7 @@
         absolute_eps_override: float = -1,
         relative_fraction_override: float = -1,
         ulp_override: float = -1,
+        sort_report: str = "ulp",
         **kwargs,
     ):
         super().__init__(reference_values, computed_values)
@@ -235,6 +252,7 @@
         self.success = self._compute_all_metrics()
         self.check = np.all(self.success)
+        self.sort_report = sort_report
 
     def _compute_all_metrics(
         self,
@@ -290,9 +308,9 @@ def _has_override(self) -> bool:
         )
 
     def one_line_report(self) -> str:
-        metric_thresholds = f"{'🔶 ' if not self.absolute_eps.is_default else '' }Absolute E(<{self.absolute_eps.value:.2e}) "
-        metric_thresholds += f"{'🔶 ' if not self.relative_fraction.is_default else '' }Relative E(<{self.relative_fraction.value * 100:.2e}%) "
-        metric_thresholds += f"{'🔶 ' if not self.ulp_threshold.is_default else '' }ULP E(<{self.ulp_threshold.value})"
+        metric_thresholds = f"{'🔶 ' if not self.absolute_eps.is_default else ''}Absolute E(<{self.absolute_eps.value:.2e}) "
+        metric_thresholds += f"{'🔶 ' if not self.relative_fraction.is_default else ''}Relative E(<{self.relative_fraction.value * 100:.2e}%) "
+        metric_thresholds += f"{'🔶 ' if not self.ulp_threshold.is_default else ''}ULP E(<{self.ulp_threshold.value})"
         if self.check and self._has_override():
             return f"🔶 No numerical differences with threshold override - metric: {metric_thresholds}"
         elif self.check:
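
# --- Editorial sketch (not part of the patch): one way to compute the "ULP"
# column reported below; NDSL's exact definition may differ. np.spacing(x)
# is the gap to the next representable float, i.e. one ULP at x.
import numpy as np

def ulp_distance(computed, references):
    return np.abs(computed - references) / np.spacing(np.abs(references))

ref = np.float64(1.0)
comp = np.nextafter(ref, np.float64(2.0))  # exactly one representable step away
assert ulp_distance(comp, ref) == 1.0
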
self.sort_report == "index": + indices_flatten = list(range(self.ulp_distance.size - 1, -1, -1)) + else: + RuntimeError( + f"[Translate test] Unknown {self.sort_report} report sorting option." + ) + for iFlat in indices_flatten[::-1]: + fi = np.unravel_index(iFlat, shape=self.ulp_distance.shape) + if np.isnan(self.computed[fi]) and np.isnan(self.references[fi]): + continue + ulp_dist = ( + self.ulp_distance[fi] + if np.isnan(self.ulp_distance[fi]) + else int(self.ulp_distance[fi]) + ) + index_as_string = "(" + for i in fi: + index_as_string += f"{i:02}," + index_as_string = index_as_string[:-1] + index_as_string += ")" + report.append( + f"{index_as_string} " + f"{_fixed_width_float_16e(self.computed[fi])} " + f"{_fixed_width_float_16e(self.references[fi])} " + f"{_fixed_width_float_2e(self.absolute_distance[fi])} {'✅' if self.absolute_distance_metric[fi] else '❌'} " + f"{_fixed_width_float_2e(self.relative_distance[fi] * 100)} {'✅' if self.relative_distance_metric[fi] else '❌'} " + f"{ulp_dist:02} {'✅' if self.ulp_distance_metric[fi] else '❌'} " + ) if file_path: with open(file_path, "w") as fd: diff --git a/setup.cfg b/setup.cfg index 1a142931..ce80d954 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,3 +34,8 @@ omit = external/* __init__.py source_pkgs = ndsl + +[metadata] +# Include the license file in the generated wheel, see +# https://wheel.readthedocs.io/en/stable/user_guide.html#including-license-files-in-the-generated-wheel-file +license_files = LICENSE.txt diff --git a/setup.py b/setup.py index 0b19d38e..e9f1c2e6 100644 --- a/setup.py +++ b/setup.py @@ -11,14 +11,17 @@ def local_pkg(name: str, relative_path: str) -> str: return path -test_requirements = ["pytest", "pytest-subtests", "coverage"] -develop_requirements = test_requirements + ["pre-commit"] +docs_requirements = ["mkdocs-material"] demos_requirements = ["ipython", "ipykernel"] +test_requirements = ["pytest", "pytest-subtests", "coverage"] + +develop_requirements = test_requirements + docs_requirements + ["pre-commit"] extras_requires = { - "test": test_requirements, - "develop": develop_requirements, "demos": demos_requirements, + "develop": develop_requirements, + "docs": docs_requirements, + "test": test_requirements, } requirements: List[str] = [ @@ -26,7 +29,7 @@ def local_pkg(name: str, relative_path: str) -> str: local_pkg("dace", "external/dace"), "mpi4py==3.1.5", "cftime", - "xarray", + "xarray>=2025.01.2", # datatree + fixes "f90nml>=1.1.0", "fsspec", "netcdf4==1.7.1", @@ -44,20 +47,19 @@ def local_pkg(name: str, relative_path: str) -> str: classifiers=[ "Development Status :: 2 - Pre-Alpha", "Intended Audience :: Developers", - "License :: OSI Approved :: BSD License", + "License :: OSI Approved :: Apache Software License", "Natural Language :: English", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.11", ], install_requires=requirements, extras_require=extras_requires, name="ndsl", - license="BSD license", + license="Apache 2.0 license", packages=find_namespace_packages(include=["ndsl", "ndsl.*"]), include_package_data=True, url="https://github.com/NOAA-GFDL/NDSL", - version="2025.01.00", + version="2025.03.00", zip_safe=False, entry_points={ "console_scripts": [ diff --git a/tests/quantity/test_deepcopy.py b/tests/quantity/test_deepcopy.py index a7b1564c..d6b5c7cb 100644 --- a/tests/quantity/test_deepcopy.py +++ b/tests/quantity/test_deepcopy.py @@ -17,9 +17,7 @@ def 
     )
     quantity_copy = copy.deepcopy(quantity)
     # assertion below is only valid if we're overwriting the entire data through view
-    assert np.product(quantity_copy.view[:].shape) == np.product(
-        quantity_copy.data.shape
-    )
+    assert np.prod(quantity_copy.view[:].shape) == np.prod(quantity_copy.data.shape)
     quantity_copy.view[:] = 1.0
     np.testing.assert_array_equal(quantity.data, 0.0)
     np.testing.assert_array_equal(quantity_copy.data, 1.0)
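
# --- Editorial note (not part of the patch): the np.product -> np.prod swap
# above is required on modern NumPy; np.product was deprecated in NumPy 1.25
# and removed in NumPy 2.0. np.prod is the supported spelling.
import numpy as np

assert np.prod((2, 3, 4)) == 24  # element count of a (2, 3, 4) array
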