NOAA-GFDL · FlorianDeconinck · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 15, 2026
diff --git a/external/dace b/external/dace
diff --git a/external/gt4py b/external/gt4py
diff --git a/ndsl/__init__.py b/ndsl/__init__.py
@@ -10,6 +10,7 @@
 from .constants import ConstantVersions
 from .dsl.caches.codepath import FV3CodePath
 from .quantity import Quantity
+from .dsl.optimization_config import OptimizationConfig
 from .dsl.ndsl_runtime import NDSLRuntime
 from .dsl.stencil import FrozenStencil, GridIndexing, StencilFactory, TimingCollector
 from .dsl.stencil_config import CompilationConfig, RunMode, StencilConfig
@@ -90,6 +91,7 @@
     "MetaEnumStr",
     "State",
     "LocalState",
+    "OptimizationConfig",
     "NDSLRuntime",
     "Local",
     "DiagManagerMonitor",

diff --git a/ndsl/comm/communicator.py b/ndsl/comm/communicator.py
@@ -786,7 +786,7 @@ def __init__(
                 "Communicator needs to be instantiated with communication subsystem"
                 f" derived from `comm_abc.Comm`, got {type(comm)}."
             )
-        if comm.Get_size() != partitioner.total_ranks:
+        if comm.Get_size() < partitioner.total_ranks:
             raise ValueError(
                 f"was given a partitioner for {partitioner.total_ranks} ranks but a "
                 f"comm object with only {comm.Get_size()} ranks, are we running "

diff --git a/ndsl/config/backend.py b/ndsl/config/backend.py
@@ -52,6 +52,8 @@ class BackendLoopOrder(Enum):
     "orch:dace:cpu:KJI": "dace:cpu_KJI",
     "st:dace:gpu:KJI": "dace:gpu",
     "orch:dace:gpu:KJI": "dace:gpu",
+    "st:dace:gpu:IJK": "dace:gpu_IJK",
+    "orch:dace:gpu:IJK": "dace:gpu_IJK",
 }
 """Internal: match the NDSL backend names with the GT4Py names"""
 

diff --git a/ndsl/dsl/caches/cache_location.py b/ndsl/dsl/caches/cache_location.py
@@ -7,46 +7,48 @@ def identify_code_path(
     partitioner: Partitioner,
     single_code_path: bool,
 ) -> FV3CodePath:
-    """Determine which code path your rank will hit.
+    """
+    Determine which code path your rank will hit.
 
-    If single_code_path is True, single_code_path is True,
-    only one code path exists (case of doubly periodic grid).
+    If single_code_path is True, only one code path exists,
+    e.g. in case of a doubly periodic grid.
     If single_code_path is False, we are in the case of the
-    cube-sphere and we will look at our position on the tile."""
+    cube-sphere and we will look at our position on the tile.
+    """
 
     # Doubly-periodic or single tile grid
-    if single_code_path:
+    if single_code_path or partitioner.layout == (1, 1):
         return FV3CodePath.All
 
     # Cube-sphere
-    if partitioner.layout == (1, 1):
-        return FV3CodePath.All
-    elif partitioner.layout[0] == 1 or partitioner.layout[1] == 1:
+    if partitioner.layout[0] <= 1 or partitioner.layout[1] <= 1:
         raise NotImplementedError(
-            f"Build for layout {partitioner.layout} is not handled"
+            f"Build for layout {partitioner.layout} is not handled."
         )
-    else:
-        if partitioner.tile.on_tile_bottom(rank):
-            if partitioner.tile.on_tile_left(rank):
-                return FV3CodePath.BottomLeft
-            if partitioner.tile.on_tile_right(rank):
-                return FV3CodePath.BottomRight
-            else:
-                return FV3CodePath.Bottom
-        if partitioner.tile.on_tile_top(rank):
-            if partitioner.tile.on_tile_left(rank):
-                return FV3CodePath.TopLeft
-            if partitioner.tile.on_tile_right(rank):
-                return FV3CodePath.TopRight
-            else:
-                return FV3CodePath.Top
-        else:
-            if partitioner.tile.on_tile_left(rank):
-                return FV3CodePath.Left
-            if partitioner.tile.on_tile_right(rank):
-                return FV3CodePath.Right
-            else:
-                return FV3CodePath.Center
+
+    # Bottom row
+    if partitioner.tile.on_tile_bottom(rank):
+        if partitioner.tile.on_tile_left(rank):
+            return FV3CodePath.BottomLeft
+        if partitioner.tile.on_tile_right(rank):
+            return FV3CodePath.BottomRight
+        return FV3CodePath.Bottom
+
+    # Top row
+    if partitioner.tile.on_tile_top(rank):
+        if partitioner.tile.on_tile_left(rank):
+            return FV3CodePath.TopLeft
+        if partitioner.tile.on_tile_right(rank):
+            return FV3CodePath.TopRight
+        return FV3CodePath.Top
+
+    # Left & right column with corners already handled
+    if partitioner.tile.on_tile_left(rank):
+        return FV3CodePath.Left
+    if partitioner.tile.on_tile_right(rank):
+        return FV3CodePath.Right
+
+    return FV3CodePath.Center
 
 
 def get_cache_fullpath(code_path: FV3CodePath) -> str:

diff --git a/ndsl/dsl/caches/codepath.py b/ndsl/dsl/caches/codepath.py
@@ -3,10 +3,12 @@
 
 class FV3CodePath(enum.Enum):
     """Enum listing all possible code paths on a cube sphere.
+
     For any layout the cube sphere has up to 9 different code paths depending on
     the positioning of the rank on the tile and which of the edge/corner cases
     it has to handle, as well as the possibility for all boundary computations in
     the 1x1 layout case.
+
     Since the framework inlines code to optimize, we _cannot_ pre-suppose which code
     being kept and/or ejected. This enum serves as the ground truth to map rank to
     the proper generated code.

diff --git a/ndsl/dsl/dace/dace_config.py b/ndsl/dsl/dace/dace_config.py
@@ -10,14 +10,20 @@
 from gt4py.cartesian.utils.compiler import cxx_compiler_defaults, gpu_configuration
 
 from ndsl import LocalComm
+from ndsl.comm import Comm
 from ndsl.comm.communicator import Communicator
 from ndsl.comm.partitioner import Partitioner
 from ndsl.config import Backend
 from ndsl.dsl import NDSL_COMPILER_SILENCE, NDSL_GLOBAL_PRECISION
 from ndsl.dsl.caches.cache_location import identify_code_path
 from ndsl.dsl.caches.codepath import FV3CodePath
+from ndsl.dsl.dace.hardware_config import get_gpu_hardware_defaults
 from ndsl.optional_imports import cupy as cp
-from ndsl.performance.collector import NullPerformanceCollector, PerformanceCollector
+from ndsl.performance.collector import (
+    AbstractPerformanceCollector,
+    NullPerformanceCollector,
+    PerformanceCollector,
+)
 
 
 if TYPE_CHECKING:
@@ -166,8 +172,8 @@ def __init__(
         Args:
             communicator: used for setting the distributed caches
             backend: string for the backend
-            tile_nx: x/y domain size for a single time
-            tile_nz: z domain size for a single time
+            tile_nx: x/y domain size for a single tile
+            tile_nz: z domain size for a single tile
             orchestration: orchestration mode from DaCeOrchestration
             time: trigger performance collection, available to user with
                 `performance_collector`
@@ -181,16 +187,12 @@ def __init__(
         # ToDo: DaceConfig becomes a bit more than a read-only config
         #       with this. Should be refactored into a DaceExecutor carrying a config
         self.loaded_dace_executables: DaceExecutables = {}
-        self.performance_collector = (
-            PerformanceCollector(
-                "InternalOrchestrationTimer",
-                comm=(
-                    LocalComm(0, 6, {}) if communicator is None else communicator.comm
-                ),
+        if not time:
+            self.performance_collector: AbstractPerformanceCollector = (
+                NullPerformanceCollector()
             )
-            if time
-            else NullPerformanceCollector()
-        )
+        else:
+            self.set_timer(communicator.comm if communicator else None)
 
         # Temporary. This is a bit too out of the ordinary for the common user.
         # We should refactor the architecture to allow for a `gtc:orchestrated:dace:X`
@@ -265,21 +267,29 @@ def __init__(
             march_option = "-mcpu=native" if is_arm_neoverse else "-march=native"
             # Removed --fast-math
             gpu_config = gpu_configuration(GT4PY_COMPILE_OPT_LEVEL)
+            gpu_cflags = " ".join(gpu_config.gpu_compile_flags).strip()
             dace.config.Config.set(
                 "compiler",
                 "cuda",
                 "args",
-                value=f"-std=c++14 {warnings_policy} -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_config.gpu_compile_flags}",
+                value=f"-std=c++14 {warnings_policy} -Xcompiler -fPIC -O{optimization_level} -Xcompiler {march_option} {gpu_cflags}",
             )
 
-            cuda_sm = cp.cuda.Device(0).compute_capability if cp else 60
-            dace.config.Config.set("compiler", "cuda", "cuda_arch", value=f"{cuda_sm}")
-            # Block size/thread count is defaulted to an average value for recent
-            # hardware (Pascal and upward). The problem of setting an optimized
-            # block/thread is both hardware and problem dependant. Fine tuners
-            # available in DaCe should be relied on for further tuning of this value.
+            # Target compilation for hardware micro-code capacities
+            gpu_defaults = get_gpu_hardware_defaults()
             dace.config.Config.set(
-                "compiler", "cuda", "default_block_size", value="64,8,1"
+                "compiler",
+                "cuda",
+                "cuda_arch",
+                value=f"{gpu_defaults.compute_capability}",
+            )
+
+            # Default block size for kernels launch
+            dace.config.Config.set(
+                "compiler",
+                "cuda",
+                "default_block_size",
+                value=str(gpu_defaults.block_size)[1:-1],
             )
             # Potentially buggy - deactivate
             dace.config.Config.set(
@@ -346,6 +356,9 @@ def __init__(
                     value="c",
                 )
 
+            # Debug lineinfo is incorrect anyway for the stencils
+            dace.config.Config.set("compiler", "lineinfo", value="none")
+
         # Attempt to kill the dace.conf to avoid confusion
         dace_conf_to_kill = dace.config.Config.cfg_filename()
         if dace_conf_to_kill is not None:
@@ -413,4 +426,20 @@ def from_dict(cls, data: dict) -> Self:
         config.rank_size = data["rank_size"]
         config.layout = data["layout"]
         config.tile_resolution = data["tile_resolution"]
-        return config
+        # TODO
+        # Computed properties like `self.code_path` and `self.do_compile`
+        # aren't updated.
+        # We also don't `set_distributed_caches()` based on that updated
+        # information.
+        raise NotImplementedError(
+            "Implementation of `DaceConfig.from_dict()` is incomplete."
+        )
+
+    def set_timer(self, comm: Comm | None) -> None:
+        """Replace `performance_collector` with a timing `PerformanceCollector`."""
+        # TODO: this absolutely should not be on a Configuration object
+        #      and even less setup outside. Madness, we have lost our ways...
+        self.performance_collector = PerformanceCollector(
+            "InternalOrchestrationTimer",
+            comm=(LocalComm(0, 6, {}) if comm is None else comm),
+        )
diff --git a/ndsl/dsl/dace/hardware_config.py b/ndsl/dsl/dace/hardware_config.py
@@ -0,0 +1,126 @@
+import dataclasses
+import sys
+from pathlib import Path
+from typing import Literal
+
+from ndsl import ndsl_log
+from ndsl.optional_imports import cupy as cp
+
+
+GPUVendor = Literal["Nvidia"] | Literal["AMD"] | Literal["Intel"] | Literal["Unknown"]
+
+# PCI-SIG vendor ids, taken from https://pcisig.com/membership/member-companies
+_VENDOR_PCI_SIGNATURES: dict[int, GPUVendor] = {
+    0x10DE: "Nvidia",
+    0x1002: "AMD",
+    0x8086: "Intel",
+    0x0: "Unknown",
+}
+
+# Cached result of `get_gpu_hardware_defaults()` - `None` until the first call
+_GPU_HARDWARE_DEFAULTS = None
+
+
+def _get_vendor() -> GPUVendor:
+    """Retrieve vendor by looking up the PCI bus id of device 0 in sysfs
+    (`/sys/bus/pci/devices/<bus id>/vendor`) and mapping the PCI-SIG vendor id.
+
+    ⚠️ Only works on Linux - kicks back to "Unknown" in other cases.
+    """
+    if not sys.platform.startswith("linux"):
+        ndsl_log.info("GPU hardware detection only possible on Linux system.")
+        return "Unknown"
+
+    # sysfs uses lowercase hex digits, CUDA may report uppercase - normalize
+    pci_device_id = cp.cuda.runtime.deviceGetPCIBusId(0).lower()
+    dev_path = Path("/sys", "bus", "pci", "devices", f"{pci_device_id}")
+    if not dev_path.exists():
+        ndsl_log.info(f"GPU detection: PCI device not found at {dev_path}.")
+        return "Unknown"
+
+    # sysfs prints the id as e.g. "0x10de"; int(..., 16) accepts the "0x" prefix
+    vendor_id = int((dev_path / "vendor").read_text().strip(), 16)
+
+    if vendor_id not in _VENDOR_PCI_SIGNATURES:
+        ndsl_log.error(f"Unknown GPU vendor with PCI-SIG ID of {vendor_id:#X}.")
+        return "Unknown"
+
+    return _VENDOR_PCI_SIGNATURES[vendor_id]
+
+
+@dataclasses.dataclass
+class GPUHardwareDefaults:
+    """Default GPU compute settings chosen for a detected hardware vendor."""
+
+    vendor: GPUVendor
+    block_size: list[int] = dataclasses.field(default_factory=list)
+    compute_capability: int = -1  # Nvidia specific, stays -1 otherwise
+
+
+def get_gpu_hardware_defaults() -> GPUHardwareDefaults:
+    """Retrieve default values for GPU computation configuration.
+
+    The result is computed once and cached at module level. Without a usable
+    cupy/CUDA runtime, or for an unrecognized vendor, a conservative block
+    size is returned and `compute_capability` keeps its default of -1.
+    """
+    global _GPU_HARDWARE_DEFAULTS
+    if _GPU_HARDWARE_DEFAULTS is not None:
+        return _GPU_HARDWARE_DEFAULTS  # type: ignore[unreachable]
+
+    if cp is None or not cp.cuda.is_available():
+        ndsl_log.warning("No cupy - defaulting for GPU hardware")
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor="Unknown",
+            # Smallest common denominator of massively parallel hardware
+            block_size=[8, 1, 1],
+        )
+        return _GPU_HARDWARE_DEFAULTS
+
+    # Who goes there
+    vendor = _get_vendor()
+    if vendor == "Nvidia":
+        compute_capability = int(cp.cuda.Device(0).compute_capability)
+        # Default block size based on compute capability. Bounds are inclusive
+        # so the first chip of a generation (e.g. sm_80, sm_60) is covered.
+        if compute_capability >= 80:
+            # Covers:
+            #  - Blackwell (100+)
+            #  - Hopper (90-100)
+            #  - Ampere (80-90)
+            block_sizes = [128, 1, 1]
+        elif compute_capability >= 60:
+            # Covers:
+            #  - Volta (70-80)
+            #  - Pascal (60-70)
+            block_sizes = [64, 8, 1]
+        else:
+            # For older hardware - we default to the safe warp-size since
+            # the dawn of GPGPU on Nvidia hardware
+            block_sizes = [32, 1, 1]
+
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            block_size=block_sizes,
+            compute_capability=compute_capability,
+        )
+    elif vendor == "AMD":
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            block_size=[64, 1, 1],  # Wave64: native on GCN/CDNA, optional on RDNA
+        )
+    elif vendor == "Intel":
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            block_size=[32, 1, 1],  # Intel runs SIMD 8, 16 or 32 - 32 performs better
+        )
+    else:
+        _GPU_HARDWARE_DEFAULTS = GPUHardwareDefaults(
+            vendor=vendor,
+            # Smallest common denominator of massively parallel hardware
+            block_size=[8, 1, 1],
+        )
+
+    ndsl_log.info(f"GPU vendor detected: {_GPU_HARDWARE_DEFAULTS.vendor}")
+
+    return _GPU_HARDWARE_DEFAULTS
+1 −1		.github/workflows/general-ci.yml
+1 −1		.github/workflows/ml-ci.yml
+4 −1		dace/codegen/compiled_sdfg.py
+36 −32		dace/codegen/control_flow.py
+1 −1		dace/codegen/instrumentation/papi.py
+5 −2		dace/codegen/targets/cuda.py
+11 −10		dace/codegen/targets/framecode.py
+1 −1		dace/config.py
+4 −1		dace/data/ctypes_interop.py
+2 −2		dace/dtypes.py
+4 −1		dace/frontend/python/newast.py
+3 −2		dace/frontend/python/parser.py
+1 −0		dace/frontend/python/replacements/array_creation.py
+1 −0		dace/frontend/python/replacements/array_manipulation.py
+1 −0		dace/frontend/python/replacements/array_metadata.py
+1 −0		dace/frontend/python/replacements/linalg.py
+3 −2		dace/frontend/python/replacements/misc.py
+1 −0		dace/frontend/python/replacements/reduction.py
+1 −0		dace/frontend/python/replacements/ufunc.py
+1 −1		dace/memlet.py
+114 −4		dace/runtime/include/dace/math.h
+115 −48		dace/sdfg/analysis/schedule_tree/tree_to_sdfg.py
+29 −3		dace/sdfg/analysis/schedule_tree/treenodes.py
+1 −1		dace/sdfg/analysis/vector_inference.py
+1 −0		dace/sdfg/infer_types.py
+10 −3		dace/sdfg/nodes.py
+296 −97		dace/sdfg/propagation.py
+94 −14		dace/sdfg/sdfg.py
+203 −18		dace/sdfg/state.py
+3 −0		dace/sdfg/utils.py
+35 −35		dace/subsets.py
+1 −0		dace/transformation/dataflow/add_threadblock_map.py
+69 −22		dace/transformation/dataflow/map_fission.py
+38 −23		dace/transformation/dataflow/redundant_array.py
+1 −1		dace/transformation/dataflow/sve/infer_types.py
+155 −47		dace/transformation/helpers.py
+12 −1		dace/transformation/interstate/state_fusion_with_happens_before.py
+7 −5		dace/transformation/passes/analysis/analysis.py
+511 −0		dace/transformation/passes/loop_to_reduce.py
+8 −15		dace/transformation/passes/reference_reduction.py
+32 −1		doc/conf.py
+218 −0		doc/extensions/backend.rst
+9 −8		doc/extensions/extensions.rst
+129 −0		doc/extensions/frontend.rst
+113 −0		doc/extensions/instrumentation.rst
+393 −0		doc/extensions/libraries.rst
+111 −0		doc/extensions/sdfgconvertible.rst
+133 −0		doc/extensions/symbolic.rst
+1 −1		doc/frontend/daceprograms.rst
+4 −6		doc/frontend/parsing.rst
+129 −0		doc/frontend/preprocessing.rst
+56 −14		doc/frontend/pysupport.rst
+1 −3		doc/frontend/python.rst
+51 −0		doc/general/faq.rst
+1 −1		doc/general/glossary.rst
+1 −1		doc/general/structure.rst
+8 −2		doc/index.rst
+84 −0		doc/optimization/guidelines.rst
+112 −0		doc/optimization/interactive.rst
+3 −5		doc/optimization/optimization.rst
+1 −4		doc/sdfg/ir.rst
+177 −0		doc/sdfg/schedule_tree.rst
+1 −1		doc/setup/integration.rst
+1 −1		doc/setup/quickstart.rst
+2 −2		doc/source/dace.cli.rst
+1 −1		doc/source/dace.codegen.instrumentation.rst
+0 −9		doc/source/dace.rst
+1 −1		tests/codegen/allocation_lifetime_test.py
+2 −1		tests/codegen/control_flow_generation_test.py
+27 −0		tests/codegen/gpu_min_warps_per_eu_test.py
+2 −1		tests/graph_test.py
+134 −0		tests/memlet_propagation_squeezing_test.py
+1 −1		tests/numpy/common.py
+406 −0		tests/passes/loop_to_reduce_test.py
+26 −21		tests/passes/writeset_underapproximation_test.py
+2 −2		tests/schedule_tree/naming_test.py
+3 −4		tests/schedule_tree/schedule_test.py
+142 −1		tests/schedule_tree/to_sdfg_test.py
+32 −1		tests/schedule_tree/treenodes_test.py
+17 −10		tests/sdfg/data/container_array_test.py
+50 −37		tests/sdfg/data/structure_test.py
+29 −4		tests/sdfg/reference_test.py
+1 −1		tests/state_transition_test.py
+105 −0		tests/transformations/helpers_test.py
+6 −6		tests/transformations/loop_to_map_test.py
+147 −1		tests/transformations/map_fission_test.py
+1 −1		tutorials/getting_started.ipynb