From f543dc183feef8fa27a1bc0508f445a1f4437113 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 15 Jul 2024 20:39:25 +0200 Subject: [PATCH 001/126] [llvm-to-ptx][libkernel] Add support for select non-relaxed atomic operations --- src/libkernel/sscp/ptx/atomic.cpp | 57 +++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/src/libkernel/sscp/ptx/atomic.cpp b/src/libkernel/sscp/ptx/atomic.cpp index 306c26bb6..66c265367 100644 --- a/src/libkernel/sscp/ptx/atomic.cpp +++ b/src/libkernel/sscp/ptx/atomic.cpp @@ -455,6 +455,16 @@ HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i16( HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int32 *ptr, __acpp_int32 x) { + if(scope == __acpp_sscp_memory_scope::system) { + if(order == __acpp_sscp_memory_order::release) { + asm volatile("st.release.generic.i32 [%0], %1;" + : + :"l"(ptr), "r"(x) + : "memory"); + return; + } + } + *ptr = x; mem_fence(scope); } @@ -484,6 +494,17 @@ HIPSYCL_SSCP_BUILTIN __acpp_int16 __acpp_sscp_atomic_load_i16( HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_load_i32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int32 *ptr) { + if(scope == __acpp_sscp_memory_scope::system) { + if(order == __acpp_sscp_memory_order::acquire) { + __acpp_int32 result; + asm volatile("ld.acquire.generic.sys.i32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; + } + } + return *ptr; } @@ -622,7 +643,18 @@ HIPSYCL_SSCP_BUILTIN bool __acpp_sscp_cmp_exch_strong_i32( __acpp_int32 old = *expected; if (scope == __acpp_sscp_memory_scope::system) { - *expected = __iAtomicCAS_system(ptr, *expected, desired); + if (success == __acpp_sscp_memory_order::acquire && + failure == __acpp_sscp_memory_order::acquire) { + __acpp_int32 compare = *expected; + __acpp_int32 result; + asm 
volatile("atom.acquire.cas.generic.sys.i32 %0,[%1],%2,%3;" + : "=r"(result) + : "l"(ptr), "r"(compare), "r"(desired) + : "memory"); + *expected = result; + } else { + *expected = __iAtomicCAS_system(ptr, *expected, desired); + } } else if (scope == __acpp_sscp_memory_scope::device) { *expected = __iAtomicCAS(ptr, *expected, desired); } else /* work group, sub group or work item */ { @@ -822,8 +854,18 @@ HIPSYCL_SSCP_BUILTIN __acpp_int16 __acpp_sscp_atomic_fetch_add_i16( HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_fetch_add_i32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int32 *ptr, __acpp_int32 x) { + if (scope == __acpp_sscp_memory_scope::system) { - return __iAtomicAdd_system(ptr, x); + if(order == __acpp_sscp_memory_order::acq_rel) { + __acpp_int32 result; + asm volatile("atom.add.acq_rel.sys.i32 %0,[%1],%2;" + : "=r"(result) + : "l"(ptr), "r"(x) + : "memory"); + return result; + } + else + return __iAtomicAdd_system(ptr, x); } else if (scope == __acpp_sscp_memory_scope::device) { return __iAtomicAdd(ptr, x); } else /* work group, sub group or work item */ { @@ -863,7 +905,16 @@ HIPSYCL_SSCP_BUILTIN __acpp_uint32 __acpp_sscp_atomic_fetch_add_u32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_uint32 *ptr, __acpp_uint32 x) { if (scope == __acpp_sscp_memory_scope::system) { - return __uAtomicAdd_system(ptr, x); + if(order == __acpp_sscp_memory_order::acq_rel) { + __acpp_uint32 result; + asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" + : "=r"(result) + : "l"(ptr), "r"(x) + : "memory"); + return result; + } + else + return __uAtomicAdd_system(ptr, x); } else if (scope == __acpp_sscp_memory_scope::device) { return __uAtomicAdd(ptr, x); } else /* work group, sub group or work item */ { From 4ed62e6380d0114edb7319e0c65201da821d2a12 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 18 Jul 2024 19:55:13 +0200 Subject: [PATCH 002/126] [CI] 
Enable running sycl_tests with SSCP on self-hosted runners --- .github/workflows/linux-self-hosted.yml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux-self-hosted.yml b/.github/workflows/linux-self-hosted.yml index 0a777fdf2..8e2544b41 100644 --- a/.github/workflows/linux-self-hosted.yml +++ b/.github/workflows/linux-self-hosted.yml @@ -60,6 +60,10 @@ jobs: echo "Running tests on CUDA..." cd ${GITHUB_WORKSPACE}/build/tests-cuda-emp ACPP_VISIBILITY_MASK="omp;cuda" ./sycl_tests + - name: run CUDA tests (SSCP) + run: | + cd ${GITHUB_WORKSPACE}/build/tests-sscp + ACPP_VISIBILITY_MASK="omp;cuda" ./sycl_tests - name: run PSTL CUDA tests (integrated multipass) run: | cd ${GITHUB_WORKSPACE}/build/tests-cuda @@ -80,7 +84,7 @@ jobs: - name: run CPU tests (SSCP) run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp - ACPP_VISIBILITY_MASK=omp LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/build/install/lib ./sycl_tests -t '!group_functions_tests/*' -t '!extension_tests/*' -t '!kernel_invocation_tests/hierarchical*' + ACPP_VISIBILITY_MASK=omp LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/build/install/lib ./sycl_tests - name: run PSTL CPU tests (SSCP) run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp @@ -129,6 +133,11 @@ jobs: echo "Running tests on AMD..." cd ${GITHUB_WORKSPACE}/build/tests-rocm-emp ACPP_VISIBILITY_MASK="omp;hip" ./sycl_tests + - name: run ROCm tests (SSCP) + run: | + echo "Running tests on AMD..." + cd ${GITHUB_WORKSPACE}/build/tests-sscp + ACPP_VISIBILITY_MASK="omp;hip" ./sycl_tests - name: run PSTL ROCm tests (SSCP) run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp @@ -148,19 +157,20 @@ jobs: cmake -DCMAKE_CXX_COMPILER=/usr/bin/clang++-${{matrix.clang_version}} -DCLANG_EXECUTABLE_PATH=/usr/bin/clang++-${{matrix.clang_version}} -DLLVM_DIR=/usr/lib/llvm-${{matrix.clang_version}}/cmake -DWITH_LEVEL_ZERO_BACKEND=ON -DWITH_OPENCL_BACKEND=ON -DCMAKE_INSTALL_PREFIX=`pwd`/install .. 
make -j3 install - name: build generic SSCP tests - if: matrix.clang_version >= 14 run: | mkdir ${GITHUB_WORKSPACE}/build/tests-sscp cd ${GITHUB_WORKSPACE}/build/tests-sscp cmake -DACPP_TARGETS="generic" -DAdaptiveCpp_DIR=${GITHUB_WORKSPACE}/build/install/lib/cmake/AdaptiveCpp -DWITH_PSTL_TESTS=ON ${GITHUB_WORKSPACE}/tests make pstl_tests -j3 + - name: run Intel tests (OpenCL) + run: | + cd ${GITHUB_WORKSPACE}/build/tests-sscp + LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/ ACPP_VISIBILITY_MASK="omp;ocl:Graphics.*" ./sycl_tests - name: run PSTL Intel tests (L0) - if: matrix.clang_version >= 14 run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp ACPP_VISIBILITY_MASK="omp;ze" ./pstl_tests - name: run PSTL Intel tests (OpenCL) - if: matrix.clang_version >= 14 run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/ ACPP_VISIBILITY_MASK="omp;ocl:Graphics.*" ./pstl_tests From c045c70e07a3e4000e97e01b8520ea331c12d125 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 18 Jul 2024 23:31:34 +0200 Subject: [PATCH 003/126] [ci] Intel: Also build sycl_tests --- .github/workflows/linux-self-hosted.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux-self-hosted.yml b/.github/workflows/linux-self-hosted.yml index 8e2544b41..188b5a8c7 100644 --- a/.github/workflows/linux-self-hosted.yml +++ b/.github/workflows/linux-self-hosted.yml @@ -161,7 +161,7 @@ jobs: mkdir ${GITHUB_WORKSPACE}/build/tests-sscp cd ${GITHUB_WORKSPACE}/build/tests-sscp cmake -DACPP_TARGETS="generic" -DAdaptiveCpp_DIR=${GITHUB_WORKSPACE}/build/install/lib/cmake/AdaptiveCpp -DWITH_PSTL_TESTS=ON ${GITHUB_WORKSPACE}/tests - make pstl_tests -j3 + make -j3 - name: run Intel tests (OpenCL) run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp From 65b853ca62c41b4a8459ba09f056d96f61dedfcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Tue, 30 Jul 2024 17:10:00 +0200 Subject: [PATCH 004/126] [Doc] fix indentation issue in 
markdown (#1548) --- doc/compilation.md | 36 ++++++++++++++++++------------------ doc/installing.md | 10 +++++----- doc/using-hipsycl.md | 32 ++++++++++++++++++-------------- 3 files changed, 41 insertions(+), 37 deletions(-) diff --git a/doc/compilation.md b/doc/compilation.md index be75af9a0..23507c975 100644 --- a/doc/compilation.md +++ b/doc/compilation.md @@ -3,18 +3,18 @@ AdaptiveCpp supports multiple types of compilation flows: 1. **A generic, single-pass compiler infrastructure that compiles kernels to a unified code representation** that is then lowered at runtime to target devices, providing a high degree of portability, low compilation times, flexibility and extensibility. **AdaptiveCpp is the only major SYCL implementation that supports a single-pass compiler design, where the code is only parsed once for both host and target devices**. Support includes: - 1. NVIDIA CUDA GPUs through PTX; - 2. AMD ROCm GPUs through amdgcn code; - 3. Intel GPUs through SPIR-V (Level Zero); - 4. SPIR-V compatible OpenCL devices supporting Intel USM extensions or fine-grained system SVM (such as Intel's OpenCL implementation for CPUs or GPUs); - 5. The host CPU through LLVM + 1. NVIDIA CUDA GPUs through PTX; + 2. AMD ROCm GPUs through amdgcn code; + 3. Intel GPUs through SPIR-V (Level Zero); + 4. SPIR-V compatible OpenCL devices supporting Intel USM extensions or fine-grained system SVM (such as Intel's OpenCL implementation for CPUs or GPUs); + 5. The host CPU through LLVM 2. Interoperability-focused multipass compilation flows. **AdaptiveCpp can aggregate existing clang toolchains and augment them with support for SYCL constructs**. This allows for a high degree of interoperability between SYCL and other models such as CUDA or HIP. For example, in this mode, the AdaptiveCpp CUDA and ROCm backends rely on the clang CUDA/HIP frontends that have been augmented by AdaptiveCpp to *additionally* also understand other models like SYCL. 
This means that the AdaptiveCpp compiler can not only compile SYCL code, but also CUDA/HIP code *even if they are mixed in the same source file*, making all CUDA/HIP features - such as the latest device intrinsics - also available from SYCL code ([details](hip-source-interop.md)). Additionally, vendor-optimized template libraries such as rocPRIM or CUB can also be used with AdaptiveCpp. This allows for highly optimized code paths in SYCL code for specific devices. Support includes: - 1. Any LLVM-supported CPU (including e.g. x86, arm, power etc) through the regular clang host toolchain with dedicated compiler transformation to accelerate SYCL constructs; - 2. NVIDIA CUDA GPUs through the clang CUDA toolchain; - 3. AMD ROCm GPUs through the clang HIP toolchain + 1. Any LLVM-supported CPU (including e.g. x86, arm, power etc) through the regular clang host toolchain with dedicated compiler transformation to accelerate SYCL constructs; + 2. NVIDIA CUDA GPUs through the clang CUDA toolchain; + 3. AMD ROCm GPUs through the clang HIP toolchain 3. Or **AdaptiveCpp can be used in library-only compilation flows**. In these compilation flows, AdaptiveCpp acts as a C++ library for third-party compilers. This can have portability advantages or simplify deployment. This includes support for: - 1. Any CPU supported by any OpenMP compilers; - 2. NVIDIA GPUs through CUDA and the NVIDIA nvc++ compiler, bringing NVIDIA vendor support and day 1 hardware support to the SYCL ecosystem + 1. Any CPU supported by any OpenMP compilers; + 2. NVIDIA GPUs through CUDA and the NVIDIA nvc++ compiler, bringing NVIDIA vendor support and day 1 hardware support to the SYCL ecosystem The following illustration shows the complete stack and its capabilities to target hardware: ![Compiler stack](img/stack.png) @@ -68,14 +68,14 @@ AdaptiveCpp allows using backend-specific language extensions (e.g. 
CUDA/HIP C++ * If a backend runs on a compiler that provides a unified, single compilation pass for both host and device, backend-specific language extensions are always available. Currently this only affects the CUDA-nvc++ backend. * If the compiler relies on separate compilation passes for host and device: - * In device compilation passes, backend-specific language extensions are always available. - * In host compilation passes, the following applies: - * If the backend runs in integrated multipass mode, backend-specific language extensions are available. - * If the backend runs in explicit multipass mode: - * For SPIR-V, language extensions are always available - * For CUDA and HIP: Language extensions from *one* of them are available in the host pass. - * If one of them runs in integrated multipass and one in explicit multipass, language extensions from the one in integrated multipass are available - * If both are in explicit multipass, `acpp` will currently automatically pick one that will have language extensions enabled in the host pass. + * In device compilation passes, backend-specific language extensions are always available. + * In host compilation passes, the following applies: + * If the backend runs in integrated multipass mode, backend-specific language extensions are available. + * If the backend runs in explicit multipass mode: + * For SPIR-V, language extensions are always available + * For CUDA and HIP: Language extensions from *one* of them are available in the host pass. + * If one of them runs in integrated multipass and one in explicit multipass, language extensions from the one in integrated multipass are available + * If both are in explicit multipass, `acpp` will currently automatically pick one that will have language extensions enabled in the host pass. 
## Summary of supported compilation targets diff --git a/doc/installing.md b/doc/installing.md index 3aab66db8..80cc5c935 100644 --- a/doc/installing.md +++ b/doc/installing.md @@ -14,8 +14,8 @@ In order to successfully build and install AdaptiveCpp, the following dependenci * python 3 (for the `acpp` compiler driver) * `cmake` * the Boost C++ libraries (in particular `boost.fiber`, `boost.context` and for the unit tests `boost.test`) - * it may be helpful to set the `BOOST_ROOT` `cmake` variable to the path to the root directory of Boost you wish to use if `cmake` does not find it automatically - * **Note for boost 1.78 users:** There seems to be a bug in the build system for boost 1.78, causing the compiled fiber and context libraries not to be copied to the installation directory. You will have to copy these libraries manually to the installation directory. In binary packages from some distribution repositories this issue is fixed. You might be only affected when building boost manually from source. + * it may be helpful to set the `BOOST_ROOT` `cmake` variable to the path to the root directory of Boost you wish to use if `cmake` does not find it automatically + * **Note for boost 1.78 users:** There seems to be a bug in the build system for boost 1.78, causing the compiled fiber and context libraries not to be copied to the installation directory. You will have to copy these libraries manually to the installation directory. In binary packages from some distribution repositories this issue is fixed. You might be only affected when building boost manually from source. In addition, the various supported [compilation flows](compilation.md) and programming models have additional requirements: @@ -106,9 +106,9 @@ The default installation prefix is `/usr/local`. Change this to your liking. ###### General * `-DCMAKE_CXX_COMPILER` should be pointed to the C++ compiler to compile AdaptiveCpp with. 
Note that this also sets the default C++ compiler for the CPU backend when using acpp once AdaptiveCpp is installed. This can however also be modified later using `HIPSYCL_CPU_CXX`. * `-DACPP_COMPILER_FEATURE_PROFILE` can be used to configure the desired degree of compiler support. Supported values: - * `full` (default and recommended): Enables all AdaptiveCpp features, requires a compatible LLVM installation as described [here](install-llvm.md). This is recommended for both functionality and performance. - * `minimal`: Only enables the older interoperability-focused compilation flows for CUDA and HIP (`--acpp-targets=cuda` and `--acpp-targets=hip`). No OpenCL or Level Zero support, no C++ standard parallelism offloading support, no generic JIT compiler (`generic` target), no compiler acceleration for SYCL constructs on CPU device. **Should only be selected in specific circumstances.** - * `none`: Disables all compiler support and dependencies on LLVM. In addition to `minimal`, also disables the support for `--acpp-targets=cuda` and `--acpp-targets=hip`. In this mode, AdaptiveCpp operates purely as a library for third-party compilers. **Should only be selected in specific circumstances.** + * `full` (default and recommended): Enables all AdaptiveCpp features, requires a compatible LLVM installation as described [here](install-llvm.md). This is recommended for both functionality and performance. + * `minimal`: Only enables the older interoperability-focused compilation flows for CUDA and HIP (`--acpp-targets=cuda` and `--acpp-targets=hip`). No OpenCL or Level Zero support, no C++ standard parallelism offloading support, no generic JIT compiler (`generic` target), no compiler acceleration for SYCL constructs on CPU device. **Should only be selected in specific circumstances.** + * `none`: Disables all compiler support and dependencies on LLVM. In addition to `minimal`, also disables the support for `--acpp-targets=cuda` and `--acpp-targets=hip`. 
In this mode, AdaptiveCpp operates purely as a library for third-party compilers. **Should only be selected in specific circumstances.** ###### generic diff --git a/doc/using-hipsycl.md b/doc/using-hipsycl.md index 14e1ac0f6..25020eba6 100644 --- a/doc/using-hipsycl.md +++ b/doc/using-hipsycl.md @@ -26,35 +26,39 @@ and can be passed either as `acpp` command line argument (`--acpp-targets=...`), Whether a compilation flow needs to be followed by a target list or not varies between the available flows and is described below. For the following compilation flows, targets cannot be specified: + * `omp.*` * `generic` For the following compilation flows, targets can optionally be specified: + * `cuda-nvcxx` - Targets take the format of `ccXY` where `XY` stands for the compute capability of the device. For the following compilation flows, targets must be specified: + * `cuda.*` - The target format is defined by clang and takes the format of `sm_XY`. For example: - * `sm_52`: NVIDIA Maxwell GPUs - * `sm_60`: NVIDIA Pascal GPUs - * `sm_70`: NVIDIA Volta GPUs + * `sm_52`: NVIDIA Maxwell GPUs + * `sm_60`: NVIDIA Pascal GPUs + * `sm_70`: NVIDIA Volta GPUs * `hip.*` - The target format is defined by clang and takes the format of `gfxXYZ`. For example: - * `gfx900`: AMD Vega 10 GPUs (e.g. Radeon Vega 56, Vega 64) - * `gfx906`: AMD Vega 20 GPUs (e.g. Radeon VII, Instinct MI50) - * `gfx908`: AMD CDNA GPUs (e.g Instinct MI100) + * `gfx900`: AMD Vega 10 GPUs (e.g. Radeon Vega 56, Vega 64) + * `gfx906`: AMD Vega 20 GPUs (e.g. Radeon VII, Instinct MI50) + * `gfx908`: AMD CDNA GPUs (e.g Instinct MI100) ### Abbreviations For some compilation flows, abbreviations exist that will be resolved by AdaptiveCpp to one of the available compilation flows: + * `omp` will be translated - * into `omp.accelerated` - * if AdaptiveCpp has been built with support for accelerated CPU and the host compiler is the clang that AdaptiveCpp has been built with or - * if `--acpp-use-accelerated-cpu` is set. 
If the accelerated CPU compilation flow is not available (e.g. AdaptiveCpp has been compiled without support for it), compilation will abort with an error. - * into `omp.library-only` otherwise + * into `omp.accelerated` + * if AdaptiveCpp has been built with support for accelerated CPU and the host compiler is the clang that AdaptiveCpp has been built with or + * if `--acpp-use-accelerated-cpu` is set. If the accelerated CPU compilation flow is not available (e.g. AdaptiveCpp has been compiled without support for it), compilation will abort with an error. + * into `omp.library-only` otherwise * `cuda` will be translated - * into `cuda.explicit-multipass` - * if another integrated multipass has been requested, or another backend that would conflict with `cuda.integrated-multipass`. AdaptiveCpp will emit a warning in this case, since switching to explicit multipass can change interoperability guarantees (see the [compilation](compilation.md) documentation). - * if `--acpp-explicit-multipass` is set explicitly - * into `cuda.integrated-multipass` otherwise + * into `cuda.explicit-multipass` + * if another integrated multipass has been requested, or another backend that would conflict with `cuda.integrated-multipass`. AdaptiveCpp will emit a warning in this case, since switching to explicit multipass can change interoperability guarantees (see the [compilation](compilation.md) documentation). + * if `--acpp-explicit-multipass` is set explicitly + * into `cuda.integrated-multipass` otherwise * `hip` will be translated into `hip.integrated-multipass` Of course, the desired flows can also always be specified explicitly. 
From f2f9dd834637486e65b876339fb5e40af7e31199 Mon Sep 17 00:00:00 2001 From: Nils Friess Date: Wed, 31 Jul 2024 17:43:50 +0200 Subject: [PATCH 005/126] Call llvm::StringRef::startswith/starts_with depending on LLVM version --- include/hipSYCL/compiler/utils/LLVMUtils.hpp | 21 ++++++++++++++++++- src/compiler/cbs/LoopSplitterInlining.cpp | 4 +++- src/compiler/cbs/SubCfgFormation.cpp | 5 +++-- .../AddressSpaceInferencePass.cpp | 5 +++-- .../llvm-to-backend/LLVMToBackend.cpp | 5 +++-- .../llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp | 3 ++- .../llvm-to-backend/spirv/LLVMToSpirv.cpp | 5 +++-- src/compiler/sscp/TargetSeparationPass.cpp | 3 ++- src/compiler/stdpar/MallocToUSM.cpp | 7 +++++-- src/compiler/stdpar/SyncElision.cpp | 6 +++--- 10 files changed, 47 insertions(+), 17 deletions(-) diff --git a/include/hipSYCL/compiler/utils/LLVMUtils.hpp b/include/hipSYCL/compiler/utils/LLVMUtils.hpp index 9cc28ab4e..4e1fc02cb 100644 --- a/include/hipSYCL/compiler/utils/LLVMUtils.hpp +++ b/include/hipSYCL/compiler/utils/LLVMUtils.hpp @@ -11,7 +11,7 @@ #ifndef HIPSYCL_LLVMUTILS_HPP #define HIPSYCL_LLVMUTILS_HPP - +#include #if LLVM_VERSION_MAJOR < 16 #define IS_OPAQUE(pointer) (pointer->isOpaquePointerTy()) #define HAS_TYPED_PTR 1 @@ -20,4 +20,23 @@ #define HAS_TYPED_PTR 0 #endif +namespace hipsycl::llvmutils { + + inline bool starts_with(llvm::StringRef String, llvm::StringRef Prefix) { +#if LLVM_VERSION_MAJOR < 18 + return String.startswith(Prefix); +#else + return String.starts_with(Prefix); +#endif + } + + inline bool ends_with(llvm::StringRef String, llvm::StringRef Prefix) { +#if LLVM_VERSION_MAJOR < 18 + return String.endswith(Prefix); +#else + return String.ends_with(Prefix); +#endif + } +}// namespace hipsycl::llvmutils + #endif // HIPSYCL_LLVMUTILS_HPP diff --git a/src/compiler/cbs/LoopSplitterInlining.cpp b/src/compiler/cbs/LoopSplitterInlining.cpp index 2e27d141f..bcef6bb15 100644 --- a/src/compiler/cbs/LoopSplitterInlining.cpp +++ 
b/src/compiler/cbs/LoopSplitterInlining.cpp @@ -11,6 +11,7 @@ #include "hipSYCL/compiler/cbs/LoopSplitterInlining.hpp" #include "hipSYCL/compiler/cbs/IRUtils.hpp" #include "hipSYCL/compiler/cbs/SplitterAnnotationAnalysis.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include "hipSYCL/common/debug.hpp" @@ -119,7 +120,8 @@ bool fillTransitiveSplitterCallers(llvm::Function &F, std::transform(F.begin(), F.end(), std::back_inserter(Blocks), [](auto &BB) { return &BB; }); if (fillTransitiveSplitterCallers(Blocks, SAA, FuncsWSplitter, - InIntrinsic || F.getName().startswith("__acpp_sscp"))) { + InIntrinsic || + hipsycl::llvmutils::starts_with(F.getName(), "__acpp_sscp"))) { FuncsWSplitter.insert(&F); return true; } diff --git a/src/compiler/cbs/SubCfgFormation.cpp b/src/compiler/cbs/SubCfgFormation.cpp index b04ad82c1..9b73472c8 100644 --- a/src/compiler/cbs/SubCfgFormation.cpp +++ b/src/compiler/cbs/SubCfgFormation.cpp @@ -127,7 +127,7 @@ getLocalSizeArgumentFromAnnotation(llvm::Function &F) { for (auto &BB : F) for (auto &I : BB) if (auto *UI = llvm::dyn_cast(&I)) - if (UI->getCalledFunction()->getName().startswith("llvm.var.annotation")) { + if (hipsycl::llvmutils::starts_with(UI->getCalledFunction()->getName(), "llvm.var.annotation")) { HIPSYCL_DEBUG_INFO << *UI << '\n'; llvm::GlobalVariable *AnnotateStr = nullptr; if (auto *CE = llvm::dyn_cast(UI->getOperand(1)); @@ -142,7 +142,8 @@ getLocalSizeArgumentFromAnnotation(llvm::Function &F) { if (auto *Data = llvm::dyn_cast(AnnotateStr->getInitializer())) { if (Data->isString() && - Data->getAsString().startswith("hipsycl_nd_kernel_local_size_arg")) { + hipsycl::llvmutils::starts_with(Data->getAsString(), + "hipsycl_nd_kernel_local_size_arg")) { if (auto *BC = llvm::dyn_cast(UI->getOperand(0))) return {BC->getOperand(0), UI}; return {UI->getOperand(0), UI}; diff --git a/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp b/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp index bd8873c3e..16e163fd4 
100644 --- a/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp +++ b/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp @@ -30,6 +30,7 @@ #include "hipSYCL/common/debug.hpp" #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceInferencePass.hpp" #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceMap.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" namespace hipsycl { namespace compiler { @@ -132,10 +133,10 @@ llvm::PreservedAnalyses AddressSpaceInferencePass::run(llvm::Module &M, forEachUseOfPointerValue(AI, [&](llvm::Value* U){ if(auto* CB = llvm::dyn_cast(U)) { llvm::StringRef CalleeName = CB->getCalledFunction()->getName(); - if(CalleeName.startswith("llvm.lifetime")) { + if(llvmutils::starts_with(CalleeName,"llvm.lifetime")) { InstsToRemove.push_back(CB); - llvm::Intrinsic::ID Id = CalleeName.startswith("llvm.lifetime.start") + llvm::Intrinsic::ID Id = llvmutils::starts_with(CalleeName, "llvm.lifetime.start") ? llvm::Intrinsic::lifetime_start : llvm::Intrinsic::lifetime_end; diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index 1731eba24..349aedf19 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -19,6 +19,7 @@ #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" #include "hipSYCL/compiler/sscp/KernelOutliningPass.hpp" #include "hipSYCL/compiler/utils/ProcessFunctionAnnotationsPass.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" #include @@ -99,8 +100,8 @@ class InstructionCleanupPass : public llvm::PassInfoMixingetCalledFunction()->getName().startswith("llvm.stacksave") || - CB->getCalledFunction()->getName().startswith("llvm.stackrestore")) + if (llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.stacksave") || + llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.stackrestore")) CallsToRemove.push_back(CB); } } diff 
--git a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp index 6be898ae6..c116e1881 100644 --- a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp +++ b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp @@ -12,6 +12,7 @@ #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceInferencePass.hpp" #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" #include "hipSYCL/common/filesystem.hpp" #include "hipSYCL/common/debug.hpp" @@ -153,7 +154,7 @@ class RocmDeviceLibs { Invocation.push_back("-fno-hip-fp32-correctly-rounded-divide-sqrt"); } - if(!llvm::StringRef{ClangPath}.endswith("hipcc")) { + if(!llvmutils::ends_with(llvm::StringRef{ClangPath}, "hipcc")) { // Normally we try to use hipcc. However, when that fails, // we may have fallen back to clang. In that case we may // have to additionally set --rocm-path and --rocm-device-lib-path. diff --git a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp index c41c44498..0b6ebbe4a 100644 --- a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp +++ b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp @@ -14,6 +14,7 @@ #include "hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp" #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" #include "hipSYCL/common/filesystem.hpp" #include "hipSYCL/common/debug.hpp" @@ -193,8 +194,8 @@ bool LLVMToSpirvTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { // llvm-spirv translator does not like llvm.lifetime.start/end operate on generic // pointers. 
auto* CalledF = CB->getCalledFunction(); - if (CalledF->getName().startswith("llvm.lifetime.start") || - CalledF->getName().startswith("llvm.lifetime.end")) { + if (llvmutils::starts_with(CalledF->getName(), "llvm.lifetime.start") || + llvmutils::starts_with(CalledF->getName(), "llvm.lifetime.end")) { if(CB->getNumOperands() > 1 && CB->getArgOperand(1)->getType()->isPointerTy()) if (CB->getArgOperand(1)->getType()->getPointerAddressSpace() == ASMap[AddressSpace::Generic]) diff --git a/src/compiler/sscp/TargetSeparationPass.cpp b/src/compiler/sscp/TargetSeparationPass.cpp index 7bf0522c1..9c9a735e3 100644 --- a/src/compiler/sscp/TargetSeparationPass.cpp +++ b/src/compiler/sscp/TargetSeparationPass.cpp @@ -19,6 +19,7 @@ #include "hipSYCL/compiler/CompilationState.hpp" #include "hipSYCL/compiler/cbs/IRUtils.hpp" #include "hipSYCL/compiler/utils/ProcessFunctionAnnotationsPass.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include "hipSYCL/common/hcf_container.hpp" #include @@ -317,7 +318,7 @@ std::unique_ptr generateDeviceIR(llvm::Module &M, // if they are not defined, not an intrinsic and don't start with // __ like our hipSYCL builtins. 
This is a hack, it would // be better if we could tell clang to annotate the declaration for us :( - if(!F.isIntrinsic() && !F.getName().startswith("__")) + if(!F.isIntrinsic() && !llvmutils::starts_with(F.getName(), "__")) ImportedSymbolsOutput.push_back(F.getName().str()); } } diff --git a/src/compiler/stdpar/MallocToUSM.cpp b/src/compiler/stdpar/MallocToUSM.cpp index e5cb2c47f..d2bae051b 100644 --- a/src/compiler/stdpar/MallocToUSM.cpp +++ b/src/compiler/stdpar/MallocToUSM.cpp @@ -10,6 +10,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/compiler/stdpar/MallocToUSM.hpp" #include "hipSYCL/compiler/cbs/IRUtils.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" @@ -71,7 +72,7 @@ bool NameStartsWithItaniumIdentifier(llvm::StringRef Name, llvm::StringRef Ident bool isRestrictedToRegularMalloc(llvm::Function* F) { llvm::StringRef Name = F->getName(); - if(!Name.startswith("_Z")) + if(!llvmutils::starts_with(Name, "_Z")) return false; if(NameStartsWithItaniumIdentifier(Name, "hipsycl")) @@ -82,7 +83,9 @@ bool isRestrictedToRegularMalloc(llvm::Function* F) { bool isStdFunction(llvm::Function* F) { llvm::StringRef Name = F->getName(); - if(Name.startswith("_ZNSt") || Name.startswith("_ZSt") || Name.startswith("_ZNKSt")) + if(llvmutils::starts_with(Name, "_ZNSt") || + llvmutils::starts_with(Name, "_ZSt") || + llvmutils::starts_with(Name, "_ZNKSt")) return true; return false; } diff --git a/src/compiler/stdpar/SyncElision.cpp b/src/compiler/stdpar/SyncElision.cpp index 506955e1d..e106fb204 100644 --- a/src/compiler/stdpar/SyncElision.cpp +++ b/src/compiler/stdpar/SyncElision.cpp @@ -10,7 +10,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/compiler/stdpar/SyncElision.hpp" #include "hipSYCL/compiler/cbs/IRUtils.hpp" - +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include #include @@ -82,7 +82,7 @@ void identifyStoresPotentiallyForStdparArgHandling( if (StdparFunctions.contains(CB->getCalledFunction())) { Users.push_back(Current); return 
true; - } else if(CB->getCalledFunction()->getName().startswith("llvm.lifetime")) { + } else if(llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.lifetime")) { return true; } } @@ -134,7 +134,7 @@ bool functionDoesNotAccessMemory(llvm::Function* F){ if(!F) return true; if(F->isIntrinsic()) { - if(F->getName().startswith("llvm.lifetime")){ + if(llvmutils::starts_with(F->getName(), "llvm.lifetime")){ return true; } } From 9e7d291cb843596101cddd6804466eeeaf3fa03e Mon Sep 17 00:00:00 2001 From: Nils Friess Date: Wed, 31 Jul 2024 18:25:57 +0200 Subject: [PATCH 006/126] Run with LLVM 18 in CI --- .github/workflows/linux-lit.yml | 4 ++-- .github/workflows/linux.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/linux-lit.yml b/.github/workflows/linux-lit.yml index b091907da..73ba8b7f4 100644 --- a/.github/workflows/linux-lit.yml +++ b/.github/workflows/linux-lit.yml @@ -8,7 +8,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - clang: [14, 15, 16, 17] + clang: [14, 15, 16, 17, 18] os: [ubuntu-22.04] steps: - uses: actions/checkout@v4 @@ -71,7 +71,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - clang: [14, 15, 16, 17] + clang: [14, 15, 16, 17, 18] os: [ubuntu-22.04] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index f0da74ceb..edd18a837 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -8,7 +8,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - clang: [15, 16] + clang: [15, 16, 17, 18] os: [ubuntu-22.04] cuda: [11.0.2] rocm: [5.4.3] From eee92e82d19d59d655da50051346f17dfaa7e197 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 31 Jul 2024 22:08:57 +0200 Subject: [PATCH 007/126] Account for driver not liking some PTX instructions --- src/libkernel/sscp/ptx/atomic.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/libkernel/sscp/ptx/atomic.cpp 
b/src/libkernel/sscp/ptx/atomic.cpp index 66c265367..f8d1f627f 100644 --- a/src/libkernel/sscp/ptx/atomic.cpp +++ b/src/libkernel/sscp/ptx/atomic.cpp @@ -457,7 +457,7 @@ HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i32( __acpp_sscp_memory_scope scope, __acpp_int32 *ptr, __acpp_int32 x) { if(scope == __acpp_sscp_memory_scope::system) { if(order == __acpp_sscp_memory_order::release) { - asm volatile("st.release.generic.i32 [%0], %1;" + asm volatile("st.release.sys.s32 [%0], %1;" : :"l"(ptr), "r"(x) : "memory"); @@ -497,7 +497,7 @@ HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_load_i32( if(scope == __acpp_sscp_memory_scope::system) { if(order == __acpp_sscp_memory_order::acquire) { __acpp_int32 result; - asm volatile("ld.acquire.generic.sys.i32 %0,[%1];" + asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(result) : "l"(ptr) : "memory"); @@ -647,7 +647,10 @@ HIPSYCL_SSCP_BUILTIN bool __acpp_sscp_cmp_exch_strong_i32( failure == __acpp_sscp_memory_order::acquire) { __acpp_int32 compare = *expected; __acpp_int32 result; - asm volatile("atom.acquire.cas.generic.sys.i32 %0,[%1],%2,%3;" + // Documentation says u32/s32 types should be allowed, + // but driver currently does not accept this. So use b32 + // instead. 
+ asm volatile("atom.acquire.sys.cas.b32 %0,[%1],%2,%3;" : "=r"(result) : "l"(ptr), "r"(compare), "r"(desired) : "memory"); @@ -858,7 +861,7 @@ HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_fetch_add_i32( if (scope == __acpp_sscp_memory_scope::system) { if(order == __acpp_sscp_memory_order::acq_rel) { __acpp_int32 result; - asm volatile("atom.add.acq_rel.sys.i32 %0,[%1],%2;" + asm volatile("atom.add.acq_rel.sys.s32 %0,[%1],%2;" : "=r"(result) : "l"(ptr), "r"(x) : "memory"); From aaeaecce9979b448a7d5386b45c100e1f8a5deb5 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 1 Aug 2024 16:14:34 +0200 Subject: [PATCH 008/126] Improve libLLVM detection by also looking for libLLVM- --- src/compiler/llvm-to-backend/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/llvm-to-backend/CMakeLists.txt b/src/compiler/llvm-to-backend/CMakeLists.txt index 9ccebee24..b3b5a9612 100644 --- a/src/compiler/llvm-to-backend/CMakeLists.txt +++ b/src/compiler/llvm-to-backend/CMakeLists.txt @@ -28,7 +28,7 @@ function(create_llvm_based_library) target_include_directories(${target} SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) target_compile_definitions(${target} PRIVATE ${LLVM_DEFINITIONS} -DHIPSYCL_COMPILER_COMPONENT) - find_library(LLVM_LIBRARY NAMES LLVM HINTS ${LLVM_LIBRARY_DIRS} NO_DEFAULT_PATH) + find_library(LLVM_LIBRARY NAMES LLVM LLVM-${LLVM_VERSION_MAJOR} HINTS ${LLVM_LIBRARY_DIRS} NO_DEFAULT_PATH) if(NOT LLVM_LIBRARY) message(FATAL_ERROR "LLVM at ${LLVM_DIR} does not have libLLVM.so. 
Please disable SSCP and related backends (-DWITH_SSCP_COMPILER=OFF -DWITH_OPENCL_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF) or choose another LLVM installation") endif() From ecb9d1362cb7b3e08aff0c8ea387ad50edb74d58 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 2 Aug 2024 19:45:34 +0200 Subject: [PATCH 009/126] [stdpar] Add simple bitonic sort --- include/hipSYCL/algorithms/algorithm.hpp | 10 ++ .../hipSYCL/algorithms/sort/bitonic_sort.hpp | 128 ++++++++++++++++++ .../stdpar/detail/offload_heuristic_db.hpp | 2 + .../std/stdpar/pstl-impl/algorithm.hpp | 72 ++++++++++ tests/CMakeLists.txt | 1 + tests/pstl/sort.cpp | 57 ++++++++ 6 files changed, 270 insertions(+) create mode 100644 include/hipSYCL/algorithms/sort/bitonic_sort.hpp create mode 100644 tests/pstl/sort.cpp diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index dfd4a6dee..1f6633dcd 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ b/include/hipSYCL/algorithms/algorithm.hpp @@ -24,6 +24,7 @@ #include "util/traits.hpp" #include "hipSYCL/algorithms/util/allocation_cache.hpp" #include "hipSYCL/algorithms/util/memory_streaming.hpp" +#include "hipSYCL/algorithms/sort/bitonic_sort.hpp" namespace hipsycl::algorithms { @@ -452,6 +453,15 @@ sycl::event none_of(sycl::queue &q, }); } +template +void sort(sycl::queue &q, RandomIt first, RandomIt last, + Compare comp = std::less<>{}) { + std::size_t problem_size = std::distance(first, last); + if(problem_size == 0) + return sycl::event{}; + + return sorting::bitonic_sort(q, first, last, comp); +} } #endif diff --git a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp new file mode 100644 index 000000000..b6d7f900b --- /dev/null +++ b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp @@ -0,0 +1,128 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
+ * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_BITONIC_SORT +#define ACPP_ALGORITHMS_BITONIC_SORT + +#include +#include "hipSYCL/sycl/queue.hpp" + +namespace hipsycl::algorithms::sorting { + + +// This function is based on the code from SyclParallelSTL, and subject +// to the following copyright and license: +/* +Copyright (c) 2015-2018 The Khronos Group Inc. + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and/or associated documentation files (the + "Materials"), to deal in the Materials without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Materials, and to + permit persons to whom the Materials are furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Materials. + + MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + https://www.khronos.org/registry/ + + THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + */ +/* bitonic_sort. 
+ * Performs a bitonic sort on the given buffer + */ +template +sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, + Comparator comp) { + + std::size_t num_elements = std::distance(first, last); + + int num_stages = 0; + // 2^numStages should be equal to length + // i.e number of times you halve the lenght to get 1 should be numStages + for (int tmp = num_elements; tmp > 1; tmp >>= 1) { + ++num_stages; + } + + sycl::event most_recent_event; + sycl::range<1> r{num_elements / 2}; + + using T = typename std::iterator_traits::value_type; + + for (int stage = 0; stage < num_stages; ++stage) { + // Every stage has stage + 1 passes + for (int pass = 0; pass < stage + 1; ++pass) { + + auto advance_to = [](RandomIt first, auto i) -> RandomIt { + std::advance(first, i); + return first; + }; + + auto kernel = [=](sycl::id<1> idx) { + + int sort_increasing = 1; + + std::size_t gid = idx.get(0); + + int pair_distance = 1 << (stage - pass); + int block_width = 2 * pair_distance; + + std::size_t left_id = + (gid % pair_distance) + (gid / pair_distance) * block_width; + std::size_t right_id = left_id + pair_distance; + + T left_element = *advance_to(first, left_id); + T right_element = *advance_to(first, right_id); + + std::size_t same_direction_block_width = 1 << stage; + + if ((gid / same_direction_block_width) % 2 == 1) { + sort_increasing = 1 - sort_increasing; + } + + T greater = left_element; + T lesser = right_element; + + if (comp(left_element, right_element)) { + greater = right_element; + lesser = left_element; + } else { + greater = left_element; + lesser = right_element; + } + + *advance_to(first, left_id) = sort_increasing ? lesser : greater; + *advance_to(first, right_id) = sort_increasing ? 
greater : lesser; + }; + + if((stage == 0 && pass == 0) || q.is_in_order()) + most_recent_event = q.parallel_for(r, kernel); + else + most_recent_event = q.parallel_for(r, most_recent_event, kernel); + + } // pass_stage + } // stage + return most_recent_event; +} // bitonic_sort +} + +#endif diff --git a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp index b2bc1d8a2..9c65ae2e6 100644 --- a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp +++ b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp @@ -50,6 +50,8 @@ struct find_if_not {}; struct all_of {}; struct any_of {}; struct none_of {}; +struct sort {}; + struct transform_reduce {}; struct reduce {}; diff --git a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp index 4eea4ca21..9e59c815d 100644 --- a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp @@ -487,6 +487,45 @@ bool none_of(hipsycl::stdpar::par_unseq, ForwardIt first, ForwardIt last, +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par_unseq, RandomIt first, + RandomIt last) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last); + }; + + auto fallback = [&](){ + std::sort(hipsycl::stdpar::par_unseq_host_fallback, first, last); + }; + + HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last)); +} + + +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par_unseq, RandomIt first, + RandomIt last, Compare comp) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last, comp); + }; + + auto fallback = [&]() { + std::sort(hipsycl::stdpar::par_unseq_host_fallback, first, last, comp); + }; + + 
HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), comp); +} + //////////////////// par policy ///////////////////////////////////// @@ -951,10 +990,43 @@ bool none_of(hipsycl::stdpar::par, ForwardIt first, ForwardIt last, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), p); } +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par, RandomIt first, + RandomIt last) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last); + }; + auto fallback = [&](){ + std::sort(hipsycl::stdpar::par_host_fallback, first, last); + }; + + HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last)); +} +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par, RandomIt first, + RandomIt last, Compare comp) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last, comp); + }; + auto fallback = [&]() { + std::sort(hipsycl::stdpar::par_host_fallback, first, last, comp); + }; + HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), comp); +} } #endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 64f58ea27..cbcbd1771 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -147,6 +147,7 @@ if(WITH_PSTL_TESTS) pstl/replace_if.cpp pstl/replace_copy.cpp pstl/replace_copy_if.cpp + pstl/sort.cpp pstl/transform.cpp pstl/transform_reduce.cpp pstl/pointer_validation.cpp diff --git a/tests/pstl/sort.cpp b/tests/pstl/sort.cpp new file mode 100644 index 
000000000..49d2c1da1 --- /dev/null +++ b/tests/pstl/sort.cpp @@ -0,0 +1,57 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include + +#include +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_sort, enable_unified_shared_memory) + +template > +void test_sort(Policy &&pol, std::size_t problem_size, Generator gen, + Comp comp = {}) { + std::vector data(problem_size); + for(int i = 0; i < problem_size; ++i) + data[i] = gen(i); + std::vector host_data = data; + + std::sort(pol, data.begin(), data.end(), comp); + + BOOST_CHECK(std::is_sorted(data.begin(), data.end(), comp)); + std::sort(host_data.begin(), host_data.end(), comp); + BOOST_CHECK(host_data == data); +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + test_sort(std::execution::par_unseq, 0, [](int i){return 0;}); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + test_sort(std::execution::par_unseq, 1, [](int i){return i;}); +} + + +BOOST_AUTO_TEST_CASE(par_unseq_pow2_descending) { + test_sort(std::execution::par_unseq, 1024, [](int i){return -i;}); +} + +BOOST_AUTO_TEST_CASE(par_unseq_pow2_ascending) { + test_sort(std::execution::par_unseq, 1024, [](int i){return i;}); +} + +BOOST_AUTO_TEST_SUITE_END() From 7687eb42a3171bdb97d8fdb989228716b5ac4d40 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sun, 4 Aug 2024 03:16:44 +0200 Subject: [PATCH 010/126] [stdpar] generalize sort for non-power of two problems --- .../hipSYCL/algorithms/sort/bitonic_sort.hpp | 140 ++++++------------ tests/pstl/sort.cpp | 8 + 2 files changed, 55 insertions(+), 93 deletions(-) diff --git a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp 
b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp index b6d7f900b..03aaf83be 100644 --- a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp +++ b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp @@ -13,116 +13,70 @@ #define ACPP_ALGORITHMS_BITONIC_SORT #include +#include #include "hipSYCL/sycl/queue.hpp" namespace hipsycl::algorithms::sorting { +namespace detail{ -// This function is based on the code from SyclParallelSTL, and subject -// to the following copyright and license: -/* -Copyright (c) 2015-2018 The Khronos Group Inc. - - Permission is hereby granted, free of charge, to any person obtaining a - copy of this software and/or associated documentation files (the - "Materials"), to deal in the Materials without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Materials, and to - permit persons to whom the Materials are furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Materials. - - MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS - KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS - SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT - https://www.khronos.org/registry/ - - THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. - */ -/* bitonic_sort. 
- * Performs a bitonic sort on the given buffer - */ -template -sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, - Comparator comp) { - - std::size_t num_elements = std::distance(first, last); - - int num_stages = 0; - // 2^numStages should be equal to length - // i.e number of times you halve the lenght to get 1 should be numStages - for (int tmp = num_elements; tmp > 1; tmp >>= 1) { - ++num_stages; - } - - sycl::event most_recent_event; - sycl::range<1> r{num_elements / 2}; - - using T = typename std::iterator_traits::value_type; - - for (int stage = 0; stage < num_stages; ++stage) { - // Every stage has stage + 1 passes - for (int pass = 0; pass < stage + 1; ++pass) { - - auto advance_to = [](RandomIt first, auto i) -> RandomIt { - std::advance(first, i); - return first; - }; - auto kernel = [=](sycl::id<1> idx) { - - int sort_increasing = 1; - - std::size_t gid = idx.get(0); +template +RandomIt advance_to(RandomIt first, Size i) { + std::advance(first, i); + return first; +} - int pair_distance = 1 << (stage - pass); - int block_width = 2 * pair_distance; +inline bool can_compare(std::size_t left_id, std::size_t right_id, + std::size_t problem_size) { - std::size_t left_id = - (gid % pair_distance) + (gid / pair_distance) * block_width; - std::size_t right_id = left_id + pair_distance; + return (left_id < right_id) && (left_id < problem_size) && + (right_id < problem_size); +} - T left_element = *advance_to(first, left_id); - T right_element = *advance_to(first, right_id); - std::size_t same_direction_block_width = 1 << stage; +} //detail - if ((gid / same_direction_block_width) % 2 == 1) { - sort_increasing = 1 - sort_increasing; - } - T greater = left_element; - T lesser = right_element; +template +sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, + Comparator comp) { - if (comp(left_element, right_element)) { - greater = right_element; - lesser = left_element; - } else { - greater = left_element; - lesser = 
right_element; + std::size_t problem_size = std::distance(first, last); + sycl::event most_recent_event; + bool is_first_kernel = true; + + auto launch_kernel = [&](std::size_t j){ + + auto k = [=](sycl::id<1> idx) { + std::size_t a_id = idx.get(0); + std::size_t b_id = a_id ^ j; + if(detail::can_compare(a_id, b_id, problem_size)) { + auto a = *detail::advance_to(first, a_id); + auto b = *detail::advance_to(first, b_id); + if(comp(b, a)) { + *detail::advance_to(first, a_id) = b; + *detail::advance_to(first, b_id) = a; } + } + }; + if(is_first_kernel || q.is_in_order()) + most_recent_event = q.parallel_for(problem_size, k); + else + most_recent_event = q.parallel_for(problem_size, most_recent_event, k); + }; + + for (std::size_t k = 2; (k >> 1) < problem_size; k *= 2) { + launch_kernel(k-1); + + for (std::size_t j = k >> 1; j > 0; j >>= 1) { + launch_kernel(j); + } + } - *advance_to(first, left_id) = sort_increasing ? lesser : greater; - *advance_to(first, right_id) = sort_increasing ? greater : lesser; - }; - - if((stage == 0 && pass == 0) || q.is_in_order()) - most_recent_event = q.parallel_for(r, kernel); - else - most_recent_event = q.parallel_for(r, most_recent_event, kernel); - - } // pass_stage - } // stage return most_recent_event; } // bitonic_sort + } #endif diff --git a/tests/pstl/sort.cpp b/tests/pstl/sort.cpp index 49d2c1da1..4fb4f7b66 100644 --- a/tests/pstl/sort.cpp +++ b/tests/pstl/sort.cpp @@ -54,4 +54,12 @@ BOOST_AUTO_TEST_CASE(par_unseq_pow2_ascending) { test_sort(std::execution::par_unseq, 1024, [](int i){return i;}); } +BOOST_AUTO_TEST_CASE(par_unseq_non_pow2_descending) { + test_sort(std::execution::par_unseq, 1000, [](int i){return -i;}); +} + +BOOST_AUTO_TEST_CASE(par_unseq_non_pow2_ascending) { + test_sort(std::execution::par_unseq, 1000, [](int i){return i;}); +} + BOOST_AUTO_TEST_SUITE_END() From 674b556b7d85c2eb01ea0acbb208197fd0e5a07d Mon Sep 17 00:00:00 2001 From: wolfwood Date: Mon, 5 Aug 2024 16:02:06 -0700 Subject: [PATCH 011/126] 
properly handle LLVM_DEFINITIONS with assigned values --- src/compiler/CMakeLists.txt | 5 ++++- src/compiler/llvm-to-backend/CMakeLists.txt | 14 ++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/compiler/CMakeLists.txt b/src/compiler/CMakeLists.txt index 96eb97f29..b60b7a05c 100644 --- a/src/compiler/CMakeLists.txt +++ b/src/compiler/CMakeLists.txt @@ -102,8 +102,11 @@ function(configure_target) ${CMAKE_BINARY_DIR}/include) target_include_directories(${target} SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) + separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) target_compile_definitions(${target} PRIVATE - ${LLVM_DEFINITIONS} -DHIPSYCL_COMPILER_COMPONENT) + ${LLVM_DEFINITIONS_LIST}) + target_compile_definitions(${target} PRIVATE + -DHIPSYCL_COMPILER_COMPONENT) if(ROCM_VERSION_MAJOR) target_compile_definitions(${target} PRIVATE -DROCM_CLANG_VERSION_MAJOR=${ROCM_VERSION_MAJOR} -DROCM_CLANG_VERSION_MINOR=${ROCM_VERSION_MINOR} -DROCM_CLANG_VERSION_PATCH=${ROCM_VERSION_PATCH}) diff --git a/src/compiler/llvm-to-backend/CMakeLists.txt b/src/compiler/llvm-to-backend/CMakeLists.txt index b3b5a9612..e5674efcd 100644 --- a/src/compiler/llvm-to-backend/CMakeLists.txt +++ b/src/compiler/llvm-to-backend/CMakeLists.txt @@ -26,8 +26,11 @@ function(create_llvm_based_library) target_include_directories(${target} PRIVATE ${LLVM_TO_BACKEND_INCLUDE_DIRS}) target_include_directories(${target} SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) - - target_compile_definitions(${target} PRIVATE ${LLVM_DEFINITIONS} -DHIPSYCL_COMPILER_COMPONENT) + + separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) + target_compile_definitions(${target} PRIVATE + ${LLVM_DEFINITIONS_LIST}) + target_compile_definitions(${target} PRIVATE -DHIPSYCL_COMPILER_COMPONENT) find_library(LLVM_LIBRARY NAMES LLVM LLVM-${LLVM_VERSION_MAJOR} HINTS ${LLVM_LIBRARY_DIRS} NO_DEFAULT_PATH) if(NOT LLVM_LIBRARY) message(FATAL_ERROR "LLVM at ${LLVM_DIR} does not have 
libLLVM.so. Please disable SSCP and related backends (-DWITH_SSCP_COMPILER=OFF -DWITH_OPENCL_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF) or choose another LLVM installation") @@ -76,8 +79,11 @@ function(create_llvm_to_backend_tool) target_include_directories(${target}-tool PRIVATE ${LLVM_TO_BACKEND_INCLUDE_DIRS}) target_include_directories(${target}-tool SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) - - target_compile_definitions(${target}-tool PRIVATE ${LLVM_DEFINITIONS} -DHIPSYCL_TOOL_COMPONENT) + + separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) + target_compile_definitions(${target}-tool PRIVATE + ${LLVM_DEFINITIONS_LIST}) + target_compile_definitions(${target}-tool PRIVATE -DHIPSYCL_TOOL_COMPONENT) target_link_libraries(${target}-tool PRIVATE ${target}) install(TARGETS ${target}-tool DESTINATION lib/hipSYCL/llvm-to-backend) From 86de6f054b93013964af9b330096553b476ab50c Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 6 Aug 2024 21:12:47 +0200 Subject: [PATCH 012/126] [stdpar][doc] Add sort to list of supported stdpar algorithms --- doc/stdpar.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/stdpar.md b/doc/stdpar.md index d422788da..a54683be4 100644 --- a/doc/stdpar.md +++ b/doc/stdpar.md @@ -42,6 +42,7 @@ Offloading is implemented for the following STL algorithms: |`any_of` | | |`all_of` | | |`none_of` | | +|`sort` | | For all other execution policies or algorithms, the algorithm will compile and execute correctly, however the regular host implementation of the algorithm provided by the C++ standard library implementation will be invoked and no offloading takes place. 
From 35a512f1e978d3412ad593f49cc31e24db0e28ee Mon Sep 17 00:00:00 2001 From: Kazuki Oikawa Date: Wed, 7 Aug 2024 20:34:55 +0900 Subject: [PATCH 013/126] [sscp/amdgpu] Fixed __atomic_work_item_fence is undefined in ROCm 6.2 --- src/libkernel/sscp/amdgpu/barrier.cpp | 94 ++++++++++++++------------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/src/libkernel/sscp/amdgpu/barrier.cpp b/src/libkernel/sscp/amdgpu/barrier.cpp index 8f22c85a6..68e2a5737 100644 --- a/src/libkernel/sscp/amdgpu/barrier.cpp +++ b/src/libkernel/sscp/amdgpu/barrier.cpp @@ -10,62 +10,68 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/sycl/libkernel/sscp/builtins/barrier.hpp" -enum amdgpu_memory_order { - relaxed = __ATOMIC_RELAXED, - acquire = __ATOMIC_ACQUIRE, - release = __ATOMIC_RELEASE, - acq_rel = __ATOMIC_ACQ_REL, - seq_cst = __ATOMIC_SEQ_CST -}; - -enum amdgpu_memory_scope { - work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - device = __OPENCL_MEMORY_SCOPE_DEVICE, - all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, - sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -}; - - -#define __CLK_LOCAL_MEM_FENCE 0x01 - - -extern "C" void -__atomic_work_item_fence(unsigned mem_fence_flags, amdgpu_memory_order, amdgpu_memory_scope); - -__attribute__((always_inline)) amdgpu_memory_order -__acpp_amdgpu_get_mem_order(__acpp_sscp_memory_order order) { - if(order == __acpp_sscp_memory_order::acq_rel) - return acq_rel; - else if(order == __acpp_sscp_memory_order::acquire) - return acquire; - else if(order == __acpp_sscp_memory_order::release) - return release; - else if(order == __acpp_sscp_memory_order::relaxed) - return relaxed; - else - return seq_cst; -} - __attribute__((always_inline)) void __acpp_amdgpu_local_barrier() { - __atomic_work_item_fence(__CLK_LOCAL_MEM_FENCE, release, work_group); + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); __builtin_amdgcn_s_barrier(); - 
__atomic_work_item_fence(__CLK_LOCAL_MEM_FENCE, acquire, work_group); + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); } __attribute__((always_inline)) void __acpp_amdgpu_mem_fence(__acpp_sscp_memory_scope fence_scope, __acpp_sscp_memory_order order) { - auto mem_order = __acpp_amdgpu_get_mem_order(order); - if(fence_scope == __acpp_sscp_memory_scope::work_group) { - __atomic_work_item_fence(0, mem_order, work_group); + switch(order) { + case __acpp_sscp_memory_order::relaxed: + break; + case __acpp_sscp_memory_order::acquire: + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); + break; + case __acpp_sscp_memory_order::release: + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + break; + case __acpp_sscp_memory_order::acq_rel: + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup"); + break; + case __acpp_sscp_memory_order::seq_cst: + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); + break; + } } else if(fence_scope == __acpp_sscp_memory_scope::device) { - __atomic_work_item_fence(0, mem_order, device); + switch(order) { + case __acpp_sscp_memory_order::relaxed: + break; + case __acpp_sscp_memory_order::acquire: + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); + break; + case __acpp_sscp_memory_order::release: + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); + break; + case __acpp_sscp_memory_order::acq_rel: + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); + break; + case __acpp_sscp_memory_order::seq_cst: + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); + break; + } } else if(fence_scope == __acpp_sscp_memory_scope::system) { - __atomic_work_item_fence(0, mem_order, all_svm_devices); + switch(order) { + case __acpp_sscp_memory_order::relaxed: + break; + case __acpp_sscp_memory_order::acquire: + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, ""); + break; + case __acpp_sscp_memory_order::release: + __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); + break; + case __acpp_sscp_memory_order::acq_rel: + 
__builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); + break; + case __acpp_sscp_memory_order::seq_cst: + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); + break; + } } } From 8a019910f7764369c28ffeb790be05507752c630 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 7 Aug 2024 17:05:42 +0200 Subject: [PATCH 014/126] Builting remapping: Also remap finite builtins --- src/compiler/sscp/StdBuiltinRemapperPass.cpp | 33 +++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/compiler/sscp/StdBuiltinRemapperPass.cpp b/src/compiler/sscp/StdBuiltinRemapperPass.cpp index b36808902..9fd6b74c9 100644 --- a/src/compiler/sscp/StdBuiltinRemapperPass.cpp +++ b/src/compiler/sscp/StdBuiltinRemapperPass.cpp @@ -13,6 +13,10 @@ #include #include +#define STRINGIFY2(x) #x +#define STRINGIFY(x) STRINGIFY2(x) + + namespace hipsycl { namespace compiler { @@ -54,7 +58,34 @@ using builtin_mapping = std::array; static constexpr std::array explicitly_mapped_builtins = { // clang sometimes (e.g. 
-ffast-math) these builtins builtin_mapping{"__powisf2", "__acpp_sscp_pown_f32"}, - builtin_mapping{"__powidf2", "__acpp_sscp_pown_f64"} + builtin_mapping{"__powidf2", "__acpp_sscp_pown_f64"}, + +#define ACPP_DECLARE_FINITE_BUILTIN_MAPPING(name) \ + builtin_mapping{STRINGIFY(__ ## name ## f_finite), STRINGIFY(__acpp_sscp_ ## name ## _f32)}, \ + builtin_mapping{STRINGIFY(__ ## name ## _finite), STRINGIFY(__acpp_sscp_ ## name ## _f64)} + + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(acos), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(acosh), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(asin), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(asinh), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(atan2), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(atanh), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(cosh), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(cos), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(exp10), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(exp2), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(exp), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(fmod), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(log10), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(log2), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(log), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(hypot), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(pow), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(remainder), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(sinh), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(sin), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(sqrt), + ACPP_DECLARE_FINITE_BUILTIN_MAPPING(tan) }; llvm::PreservedAnalyses StdBuiltinRemapperPass::run(llvm::Module &M, From 4ea3d118ef571519109e47e18dad8e435fd588ce Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 8 Aug 2024 23:43:44 +0200 Subject: [PATCH 015/126] Relax profiling tests to account for different time resolution of submission and start/end timestamps --- tests/sycl/profiler.cpp | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/sycl/profiler.cpp b/tests/sycl/profiler.cpp index a34b719bf..4f2f08793 100644 --- a/tests/sycl/profiler.cpp +++ 
b/tests/sycl/profiler.cpp @@ -95,7 +95,14 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_submit>(); auto t13 = evt1.get_profiling_info(); - BOOST_CHECK(t11 <= t12 && t12 <= t13); + // We cannot test that submit time is <= command start time, since + // in some backends (e.g. CUDA) time is only measured as elapsed time + // between two points in low-precision float. Submission time on the other + // hand is always exact. So there might be rounding errors causing + // t12 > t11. + // The same thing could in principle happen when comparing submission time + // with command end time, but hopefully this is less likely. + BOOST_CHECK(t11 <= t13 && t12 <= t13); auto evt2 = queue.submit([&](cl::sycl::handler &cgh) { auto acc = buf1.get_access(cgh); @@ -116,7 +123,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t23 = evt2.get_profiling_info(); - BOOST_CHECK(t21 <= t22 && t22 <= t23); + BOOST_CHECK(t21 <= t23 && t22 <= t23); auto t31 = evt3.get_profiling_info< cl::sycl::info::event_profiling::command_submit>(); @@ -124,7 +131,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t33 = evt3.get_profiling_info(); - BOOST_CHECK(t31 <= t32 && t32 <= t33); + BOOST_CHECK(t31 <= t33 && t32 <= t33); BOOST_CHECK(t21 <= t31 && t23 <= t32); auto evt4 = queue.submit([&](cl::sycl::handler &cgh) { @@ -143,7 +150,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t53 = evt5.get_profiling_info(); - BOOST_CHECK(t51 <= t52 && t52 <= t53); + BOOST_CHECK(t51 <= t53 && t52 <= t53); // re-ordered auto t41 = evt4.get_profiling_info< @@ -152,7 +159,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t43 = evt4.get_profiling_info(); - BOOST_CHECK(t41 <= t42 && t42 <= t43); + BOOST_CHECK(t41 <= t43 && t42 <= t43); // usm auto *src = cl::sycl::malloc_shared(n, queue); @@ -166,7 
+173,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t63 = evt6.get_profiling_info(); - BOOST_CHECK(t61 <= t62 && t62 <= t63); + BOOST_CHECK(t61 <= t63 && t62 <= t63); auto evt7 = queue.submit( [&](cl::sycl::handler &cgh) { cgh.memcpy(dest, src, sizeof src); }); @@ -176,7 +183,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t73 = evt7.get_profiling_info(); - BOOST_CHECK(t71 <= t72 && t72 <= t73); + BOOST_CHECK(t71 <= t73 && t72 <= t73); auto evt8 = queue.submit( [&](cl::sycl::handler &cgh) { cgh.prefetch(dest, sizeof src); }); @@ -187,7 +194,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) auto t83 = evt8.get_profiling_info(); // run time may be zero if prefetching is a no-op - BOOST_CHECK(t81 <= t82 && t82 <= t83); + BOOST_CHECK(t81 <= t83 && t82 <= t83); cl::sycl::free(src, queue); cl::sycl::free(dest, queue); From bde71ac69c2580e19c01be0dfe78cdb4672f49d3 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sun, 11 Aug 2024 00:07:29 +0200 Subject: [PATCH 016/126] queue::wait(): Don't call flush_sync for non-emulated in-order queue when no non-instant operation was executed --- include/hipSYCL/sycl/queue.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/hipSYCL/sycl/queue.hpp b/include/hipSYCL/sycl/queue.hpp index 4759c73b4..bef9dc022 100644 --- a/include/hipSYCL/sycl/queue.hpp +++ b/include/hipSYCL/sycl/queue.hpp @@ -153,6 +153,8 @@ class queue : public detail::property_carrying_object // Prevents kernel cache from becoming invalid while we have a queue std::shared_ptr kernel_cache; + // For non-emulated in-order queues only + std::atomic has_non_instant_operations = false; }; template @@ -355,7 +357,8 @@ class queue : public detail::property_carrying_object assert(exec); // Need to ensure everything is submitted before waiting on the stream // in case we have non-instant operations - _impl->requires_runtime.get()->dag().flush_sync(); + 
if(_impl->has_non_instant_operations.load(std::memory_order_relaxed)) + _impl->requires_runtime.get()->dag().flush_sync(); auto err = exec->wait(); if(!err.is_success()) { @@ -1047,6 +1050,7 @@ class queue : public detail::property_carrying_object if(_impl->needs_in_order_emulation) { _impl->previous_submission = node; } else if(cgh.contains_non_instant_nodes()) { + _impl->has_non_instant_operations.store(true, std::memory_order_relaxed); // If we have instant submission enabled, non-emulated in-order queue // but non-instant tasks, we need to flush the dag, otherwise future instant // tasks might not wait on the tasks that have been cached in the dag From eb9812aef75d1645338bbde96ffcc006ce89f7ea Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Wed, 31 Jul 2024 21:12:19 +0200 Subject: [PATCH 017/126] Fix support for Clang 19 --- include/hipSYCL/compiler/cbs/IRUtils.hpp | 1 + include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp | 1 + src/compiler/cbs/VectorizationInfo.cpp | 1 + .../llvm-to-backend/DeadArgumentEliminationPass.cpp | 1 + .../llvm-to-backend/GlobalInliningAttributorPass.cpp | 2 ++ .../llvm-to-backend/GlobalSizesFitInI32OptPass.cpp | 3 ++- src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp | 3 ++- src/compiler/llvm-to-backend/LLVMToBackend.cpp | 8 ++++---- src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp | 2 +- src/compiler/sscp/StdAtomicRemapperPass.cpp | 1 + src/compiler/sscp/StdBuiltinRemapperPass.cpp | 4 ++++ src/compiler/stdpar/SyncElision.cpp | 2 +- 12 files changed, 21 insertions(+), 8 deletions(-) diff --git a/include/hipSYCL/compiler/cbs/IRUtils.hpp b/include/hipSYCL/compiler/cbs/IRUtils.hpp index ebbd8520d..7fffdef22 100644 --- a/include/hipSYCL/compiler/cbs/IRUtils.hpp +++ b/include/hipSYCL/compiler/cbs/IRUtils.hpp @@ -15,6 +15,7 @@ #include #include +#include namespace llvm { class Region; diff --git a/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp b/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp index 
fdee51bfa..037f4fd42 100644 --- a/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp +++ b/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace hipsycl::compiler { using SmallValVec = llvm::SmallVector; diff --git a/src/compiler/cbs/VectorizationInfo.cpp b/src/compiler/cbs/VectorizationInfo.cpp index 72c439776..6c9040cc4 100644 --- a/src/compiler/cbs/VectorizationInfo.cpp +++ b/src/compiler/cbs/VectorizationInfo.cpp @@ -17,6 +17,7 @@ #include #include #include +#include using namespace llvm; diff --git a/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp b/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp index d8362fd62..68d8b8ea4 100644 --- a/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp +++ b/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace hipsycl { diff --git a/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp b/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp index 52671d980..22f6b57fe 100644 --- a/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp +++ b/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp @@ -10,6 +10,8 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/compiler/llvm-to-backend/GlobalInliningAttributorPass.hpp" +#include + namespace hipsycl { namespace compiler { diff --git a/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp b/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp index 0faf0fceb..dffdaf95a 100644 --- a/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp +++ b/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace hipsycl { namespace compiler { @@ -124,4 +125,4 @@ llvm::PreservedAnalyses GlobalSizesFitInI32OptPass::run(llvm::Module &M, return llvm::PreservedAnalyses::none(); } } -} \ No 
newline at end of file +} diff --git a/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp b/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp index e8fa62887..888415f3d 100644 --- a/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp +++ b/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace hipsycl { @@ -93,4 +94,4 @@ llvm::PreservedAnalyses KnownGroupSizeOptPass::run(llvm::Module &M, } -} \ No newline at end of file +} diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index 349aedf19..b7a5038a6 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -66,7 +66,7 @@ bool linkBitcode(llvm::Module &M, std::unique_ptr OtherM, void setFastMathFunctionAttribs(llvm::Module& M) { auto forceAttr = [&](llvm::Function& F, llvm::StringRef Key, llvm::StringRef Value) { if(F.hasFnAttribute(Key)) { - if(!F.getFnAttribute(Key).getValueAsString().equals(Value)) + if(F.getFnAttribute(Key).getValueAsString() != Value) F.removeFnAttr(Key); } F.addFnAttr(Key, Value); @@ -343,11 +343,11 @@ bool LLVMToBackendTranslator::optimizeFlavoredIR(llvm::Module& M, PassHandler& P // silence optimization remarks,.. 
M.getContext().setDiagnosticHandlerCallBack( - [](const llvm::DiagnosticInfo &DI, void *Context) { + [](const llvm::DiagnosticInfo *DI, void *Context) { llvm::DiagnosticPrinterRawOStream DP(llvm::errs()); - if (DI.getSeverity() == llvm::DS_Error) { + if (DI->getSeverity() == llvm::DS_Error) { llvm::errs() << "LLVMToBackend: Error: "; - DI.print(DP); + DI->print(DP); llvm::errs() << "\n"; } }); diff --git a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp index 0b6ebbe4a..d33522d2e 100644 --- a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp +++ b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp @@ -195,7 +195,7 @@ bool LLVMToSpirvTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { // pointers. auto* CalledF = CB->getCalledFunction(); if (llvmutils::starts_with(CalledF->getName(), "llvm.lifetime.start") || - llvmutils::starts_with(CalledF->getName(), "llvm.lifetime.end")) { + llvmutils::starts_with(CalledF->getName(), "llvm.lifetime.end")) { if(CB->getNumOperands() > 1 && CB->getArgOperand(1)->getType()->isPointerTy()) if (CB->getArgOperand(1)->getType()->getPointerAddressSpace() == ASMap[AddressSpace::Generic]) diff --git a/src/compiler/sscp/StdAtomicRemapperPass.cpp b/src/compiler/sscp/StdAtomicRemapperPass.cpp index dad04c16a..9f0ab6436 100644 --- a/src/compiler/sscp/StdAtomicRemapperPass.cpp +++ b/src/compiler/sscp/StdAtomicRemapperPass.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include diff --git a/src/compiler/sscp/StdBuiltinRemapperPass.cpp b/src/compiler/sscp/StdBuiltinRemapperPass.cpp index 9fd6b74c9..ce633c989 100644 --- a/src/compiler/sscp/StdBuiltinRemapperPass.cpp +++ b/src/compiler/sscp/StdBuiltinRemapperPass.cpp @@ -10,6 +10,10 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/compiler/sscp/StdBuiltinRemapperPass.hpp" #include "hipSYCL/common/debug.hpp" + +#include +#include + #include #include diff --git a/src/compiler/stdpar/SyncElision.cpp 
b/src/compiler/stdpar/SyncElision.cpp index e106fb204..1d192d2ca 100644 --- a/src/compiler/stdpar/SyncElision.cpp +++ b/src/compiler/stdpar/SyncElision.cpp @@ -202,7 +202,7 @@ void forEachReachableInstructionRequiringSync( while(Current) { if(auto* CB = llvm::dyn_cast(Current)) { llvm::Function* CalledF = CB->getCalledFunction(); - if(CalledF->getName().equals(BarrierBuiltinName)) { + if(CalledF->getName() == BarrierBuiltinName) { // basic block already contains barrier; nothing to do return; } From f1e8a252a5ad77f36df7a532326a1e5216632fef Mon Sep 17 00:00:00 2001 From: Nils Friess Date: Wed, 14 Aug 2024 11:57:15 +0200 Subject: [PATCH 018/126] Use different type of lambda argument depending on LLVM version --- src/compiler/llvm-to-backend/LLVMToBackend.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index b7a5038a6..b778c2536 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -101,7 +101,7 @@ class InstructionCleanupPass : public llvm::PassInfoMixingetCalledFunction()->getName(), "llvm.stacksave") || - llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.stackrestore")) + llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.stackrestore")) CallsToRemove.push_back(CB); } } @@ -343,6 +343,7 @@ bool LLVMToBackendTranslator::optimizeFlavoredIR(llvm::Module& M, PassHandler& P // silence optimization remarks,.. 
M.getContext().setDiagnosticHandlerCallBack( +#if LLVM_VERSION_MAJOR >= 19 [](const llvm::DiagnosticInfo *DI, void *Context) { llvm::DiagnosticPrinterRawOStream DP(llvm::errs()); if (DI->getSeverity() == llvm::DS_Error) { @@ -351,6 +352,16 @@ bool LLVMToBackendTranslator::optimizeFlavoredIR(llvm::Module& M, PassHandler& P llvm::errs() << "\n"; } }); +#else + [](const llvm::DiagnosticInfo &DI, void *Context) { + llvm::DiagnosticPrinterRawOStream DP(llvm::errs()); + if (DI.getSeverity() == llvm::DS_Error) { + llvm::errs() << "LLVMToBackend: Error: "; + DI.print(DP); + llvm::errs() << "\n"; + } + }); +#endif llvm::ModulePassManager MPM = PH.PassBuilder->buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3); From 97a0903118240c4b30dc65afb1ceffb02d274f0b Mon Sep 17 00:00:00 2001 From: Marco Julian Solanki Date: Fri, 20 Sep 2024 16:00:06 +0200 Subject: [PATCH 019/126] Replace spurious references to hipSYCL in `acpp --help` --- bin/acpp | 144 +++++++++++++++++++++++++++---------------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/bin/acpp b/bin/acpp index 236834883..3891749e7 100755 --- a/bin/acpp +++ b/bin/acpp @@ -44,7 +44,7 @@ class hcf_node: @property def subnodes(self): return self._subnodes - + def make_subnode(self, name): n = hcf_node(name, self._nesting_level+1) self._subnodes.append(n) @@ -58,7 +58,7 @@ class hcf_node: @property def values(self): return self._key_value_pairs - + @property def name(self): return self._node_name @@ -69,12 +69,12 @@ class hcf_node: for k in self._key_value_pairs: result += "{}{}={}\n".format(indent,k, self._key_value_pairs[k]) - + for n in self._subnodes: result += indent + "{." + n.name + "\n" result += str(n) result += indent + "}." 
+ n.name + "\n" - + return result class hcf_generator: @@ -102,7 +102,7 @@ class hcf_generator: # Return non-binary readable part def __str__(self) -> str: return str(self._root) + "__acpp_hcf_binary_appendix" - + @property def bytes(self): result = str(self).encode("utf-8") @@ -113,7 +113,7 @@ class hcf_generator: @property def escaped_bytes(self): hex = binascii.hexlify(self.bytes).decode("utf-8") - return ",".join(["0x" + hex[i:i+2] + return ",".join(["0x" + hex[i:i+2] for i in range(0,len(hex),2)]) @@ -124,14 +124,14 @@ class integration_header: self._hcf.root.values["object-id"] = self._object_id self._hcf.root.values["generator"] = "syclcc" self._backend = backend_name - + @property def hcf_object(self): return self._hcf - + def __str__(self) -> str: hcf_string = self._hcf.escaped_bytes - + header = """ #ifndef ACPP_{capital_name}_INTEGRATION_HEADER #define ACPP_{capital_name}_INTEGRATION_HEADER @@ -142,12 +142,12 @@ ACPP_STATIC_HCF_REGISTRATION({hcf_object_id}ull, __acpp_hcf_object_{hcf_object_i #endif """.format( - capital_name = self._backend.upper(), + capital_name = self._backend.upper(), name = self._backend.lower(), hcf_object_id = self._object_id, hcf_size = len(self._hcf.bytes), hcf_binary = hcf_string) - + return header def write_header(self, filename): @@ -157,7 +157,7 @@ ACPP_STATIC_HCF_REGISTRATION({hcf_object_id}ull, __acpp_hcf_object_{hcf_object_i class config_db: # Scans the provided directory for json files def __init__(self, config_file_dirs): - + self._data = {} self._locations = {} self._config_dirs = config_file_dirs @@ -236,15 +236,15 @@ class acpp_config: # 3.) the field in the config file. self._options = { 'platform': option("--acpp-platform", "ACPP_PLATFORM", "default-platform", -""" (deprecated) The platform that hipSYCL should target. Valid values: +""" (deprecated) The platform that AdaptiveCpp should target. 
Valid values: * cuda: Target NVIDIA CUDA GPUs * rocm: Target AMD GPUs running on the ROCm platform * cpu: Target only CPUs"""), 'clang': option("--acpp-clang", "ACPP_CLANG", "default-clang", """ The path to the clang executable that should be used for compilation - (Note: *must* be compatible with the clang version that the - hipSYCL clang plugin was compiled against!)"""), + (Note: *must* be compatible with the clang version that the + AdaptiveCpp clang plugin was compiled against!)"""), 'nvcxx': option("--acpp-nvcxx", "ACPP_NVCXX", "default-nvcxx", """ The path to the nvc++ executable that should be used for compilation @@ -263,7 +263,7 @@ class acpp_config: 'cpu-compiler': option("--acpp-cpu-cxx", "ACPP_CPU_CXX", "default-cpu-cxx", """ The compiler that should be used when targeting only CPUs."""), - + 'clang-include-path' : option("--acpp-clang-include-path", "ACPP_CLANG_INCLUDE_PATH", "default-clang-include-path", """ The path to clang's internal include headers. Typically of the form $PREFIX/include/clang//include. Only required by ROCm."""), @@ -294,7 +294,7 @@ class acpp_config: 'config-file-dir' : option("--acpp-config-file-dir", "ACPP_CONFIG_FILE_DIR", "default-config-file-dir", """ Select an alternative path for the config files containing the default AdaptiveCpp settings. It is normally not necessary for the user to change this setting. """), - + 'targets': option("--acpp-targets", "ACPP_TARGETS", "default-targets", """ Specify backends and targets to compile for. Example: --acpp-targets='omp;hip:gfx900,gfx906' Available backends: @@ -304,11 +304,11 @@ class acpp_config: Uses Boost.Fiber for nd_range parallel_for support. - omp.accelerated: Uses clang as host compiler to enable compiler support for nd_range parallel_for (see --acpp-use-accelerated-cpu). - * cuda - CUDA backend + * cuda - CUDA backend Requires specification of targets of the form sm_XY, e.g. 
sm_70 for Volta, sm_60 for Pascal Backend Flavors: - - cuda.explicit-multipass: CUDA backend in explicit multipass mode + - cuda.explicit-multipass: CUDA backend in explicit multipass mode (see --acpp-explicit-multipass) - cuda.integrated-multipass: Force CUDA backend to operate in integrated multipass mode. @@ -324,7 +324,7 @@ class acpp_config: multipass mode. * generic - use generic LLVM SSCP compilation flow, and JIT at runtime to target device"""), - 'stdpar-prefetch-mode' : option("--acpp-stdpar-prefetch-mode", "ACPP_STDPAR_PREFETCH_MODE", "default-stdpar-prefetch-mode", + 'stdpar-prefetch-mode' : option("--acpp-stdpar-prefetch-mode", "ACPP_STDPAR_PREFETCH_MODE", "default-stdpar-prefetch-mode", """ AdaptiveCpp supports issuing automatic USM prefetch operations for allocations used inside offloaded C++ PSTL algorithms. This flags determines the strategy for submitting such prefetches. Supported values are: @@ -344,7 +344,7 @@ class acpp_config: of a work-group in a single thread, eliminating scheduling overhead and enabling enhanced vectorization opportunities compared to the fiber variant."""), 'is-dryrun': option("--acpp-dryrun", "ACPP_DRYRUN", "default-is-dryrun", -""" If set, only shows compilation commands that would be executed, +""" If set, only shows compilation commands that would be executed, but does not actually execute it. """), 'is-explicit-multipass': option("--acpp-explicit-multipass", "ACPP_EXPLICIT_MULTIPASS", "default-is-explicit-multipass", @@ -354,13 +354,13 @@ class acpp_config: For example, you cannot use the CUDA kernel launch syntax[i.e. kernel <<< ... >>> (...)] in this mode. 
"""), 'should-save-temps': option("--acpp-save-temps", "ACPP_SAVE_TEMPS", "default-save-temps", """ If set, do not delete temporary files created during compilation."""), - 'stdpar' : option("--acpp-stdpar", "ACPP_STDPAR", "default-is-stdpar", + 'stdpar' : option("--acpp-stdpar", "ACPP_STDPAR", "default-is-stdpar", """ If set, enables SYCL offloading of C++ standard parallel algorithms."""), - 'stdpar-system-usm' : option("--acpp-stdpar-system-usm", "ACPP_STDPAR_SYSTEM_USM", "default-is-stdpar-system-usm", + 'stdpar-system-usm' : option("--acpp-stdpar-system-usm", "ACPP_STDPAR_SYSTEM_USM", "default-is-stdpar-system-usm", """ If set, assume availability of system-level unified shared memory where every pointer from regular malloc() is accessible on GPU. This disables automatic hijacking of memory allocations at the compiler level by AdaptiveCpp."""), - 'stdpar-unconditional-offload' : option("--acpp-stdpar-unconditional-offload", "ACPP_STDPAR_UNCONDITIONAL_OFFLOAD", "default-is-stdpar-unconditional-offload", + 'stdpar-unconditional-offload' : option("--acpp-stdpar-unconditional-offload", "ACPP_STDPAR_UNCONDITIONAL_OFFLOAD", "default-is-stdpar-unconditional-offload", """ Normally, heuristics are employed to determine whether algorithms should be offloaded. This particularly affects small problem sizes. 
If this flag is set, supported parallel STL algorithms will be offloaded unconditionally.""") @@ -375,7 +375,7 @@ class acpp_config: self._targets = None self._cxx_path = None self._clang_path = None - + for arg in self._args: if self._is_acpp_arg(arg): self._acpp_args.append(arg) @@ -383,22 +383,22 @@ class acpp_config: self._acpp_args.append(self._upgrade_legacy_arg(arg)) else: self._forwarded_args.append(arg) - + for envvar in os.environ: if self._is_acpp_envvar(envvar): self._acpp_environment_args[envvar] = os.environ[envvar] elif self._is_acpp_envvar(self._upgrade_legacy_environ_var(envvar)): self._acpp_environment_args[self._upgrade_legacy_environ_var(envvar)] = os.environ[envvar] - + config_file_directories = [] install_config_dir = os.path.abspath( os.path.join(self.acpp_installation_path, "etc/AdaptiveCpp")) - + # TODO try using some more portable path here global_config_dir = '/etc/AdaptiveCpp' - + if self._is_option_set_to_non_default_value("config-file-dir"): config_file_directories.append(self._retrieve_option("config-file-dir")) elif os.path.exists(install_config_dir): @@ -407,7 +407,7 @@ class acpp_config: config_file_directories.append(global_config_dir) self._config_db = config_db(config_file_directories) - + self._common_compiler_args = self._get_std_compiler_args() @@ -428,7 +428,7 @@ class acpp_config: if arg.startswith(accepted_arg + "=") or arg == accepted_arg: return True return False - + def _is_acpp_envvar(self, varname): accepted_vars = [self._options[opt].environment for opt in self._options] accepted_vars += [self._flags[flag].environment for flag in self._flags] @@ -481,7 +481,7 @@ class acpp_config: for arg in self._acpp_args: if arg == flag.commandline: return True - + if arg.startswith(flag.commandline + "="): return self._interpret_flag(arg.split("=")[1]) @@ -537,7 +537,7 @@ class acpp_config: def _substitute_rocm_template_string(self, template_string): return self._substitute_template_string( template_string, 
self._get_rocm_substitution_vars()) - + def _substitute_cuda_template_string(self, template_string): return self._substitute_template_string( template_string, self._get_cuda_substitution_vars()) @@ -573,7 +573,7 @@ class acpp_config: # Try config db if self._config_db.contains_key(opt.config_db): return self._config_db.get(opt.config_db) - + if not allow_unset: raise OptionNotSet("Required command line argument {} or environment variable {} not specified".format( opt.commandline, opt.environment)) @@ -600,13 +600,13 @@ class acpp_config: def _parse_targets(self, target_arg): # Split backends by ; platform_substrings = target_arg.replace("'","").replace('"',"").split(';') - + result = {} for p in platform_substrings: platform_target_separated = p.split(':', 1) if len(platform_target_separated) > 2 or len(platform_target_separated) == 0: raise RuntimeError("Invalid target description: " + p) - + platform = platform_target_separated[0].strip().lower() if not platform in result: @@ -619,7 +619,7 @@ class acpp_config: result[platform].append(t) return result - + def _get_executable_path(self, path): normalized_path = shutil.which(path) if normalized_path: @@ -628,21 +628,21 @@ class acpp_config: @property def version(self): - + if not self._config_db.contains_key("version-major"): raise OptionNotSet("Could not retrieve major version from config file") if not self._config_db.contains_key("version-minor"): raise OptionNotSet("Could not retrieve major version from config file") if not self._config_db.contains_key("version-patch"): raise OptionNotSet("Could not retrieve major version from config file") - + # version suffix may be empty if git queries fail suffix = "" if self._config_db.contains_key("version-suffix"): suffix = self._config_db.get("version-suffix") return ( - self._config_db.get("version-major"), + self._config_db.get("version-major"), self._config_db.get("version-minor"), self._config_db.get("version-patch"), suffix) @@ -671,7 +671,7 @@ class acpp_config: 
@property def targets(self): - + if self._targets == None: raw_target_string = "" try: @@ -689,11 +689,11 @@ class acpp_config: if platform in hip_platform_synonyms: target_arch = self._retrieve_option("gpu-arch") raw_target_string = "hip:" + target_arch - + elif platform in cuda_platform_synonyms: target_arch = self._retrieve_option("gpu-arch") raw_target_string = "cuda:" + target_arch - + elif platform in pure_cpu_platform_synonyms: raw_target_string = "omp" except OptionNotSet: @@ -804,7 +804,7 @@ class acpp_config: return self._is_flag_set("use-accelerated-cpu") except OptionNotSet: return False - + @property def is_explicit_multipass(self): try: @@ -832,7 +832,7 @@ class acpp_config: return self._is_flag_set("stdpar-unconditional-offload") except OptionNotSet: return False - + @property def stdpar_prefetch_mode(self): return self._retrieve_option("stdpar-prefetch-mode") @@ -887,7 +887,7 @@ class acpp_config: if ending.isnumeric() or ending in ["s", "fast", "g"]: return True return False - + def contains_linking_stage(self): for arg in self.forwarded_compiler_arguments: if (arg == "-E" or @@ -976,12 +976,12 @@ class cuda_multipass_invocation: 'Unnamed kernel lambdas are unsupported in this configuration because the selected host compiler ' +self._host_compiler+' does not match the device compiler of the backend '+self.get_device_compiler() ] - + return { 'requires-extended-host-pass' : requires_extended_host_pass, 'extended-host-pass-providers' : [ - 'cuda.explicit-multipass', 'hip.explicit-multipass', + 'cuda.explicit-multipass', 'hip.explicit-multipass', 'cuda.integrated-multipass', 'hip.integrated-multipass'], 'conflicts' : [], 'caveats' : caveats @@ -1046,7 +1046,7 @@ class cuda_multipass_invocation: for target,ptx in zip(targets, ptx_content): target_node = header.hcf_object.root.make_subnode(target) header.hcf_object.attach_text_content(target_node, ptx) - + header.write_header(self._integration_header) class hip_multipass_invocation: @@ -1103,12 +1103,12 
@@ class hip_multipass_invocation: 'Unnamed kernel lambdas are unsupported in this configuration because the selected host compiler ' +self._host_compiler+' does not match the device compiler of the backend '+self.get_device_compiler() ] - + return { 'requires-extended-host-pass' : requires_extended_host_pass, 'extended-host-pass-providers' : [ - 'cuda.explicit-multipass', 'hip.explicit-multipass', + 'cuda.explicit-multipass', 'hip.explicit-multipass', 'cuda.integrated-multipass', 'hip.integrated-multipass'], 'conflicts' : [], 'caveats' : caveats @@ -1171,7 +1171,7 @@ class hip_multipass_invocation: for target,hipfb in zip(targets, hipfb_content): target_node = header.hcf_object.root.make_subnode(target) header.hcf_object.attach_binary_content(target_node, hipfb) - + header.write_header(self._integration_header) @@ -1228,7 +1228,7 @@ class cuda_invocation: if not sys.platform.startswith("win32"): flags += ["-fpass-plugin=" + self._acpp_plugin_path] - + return flags def get_linker_flags(self): @@ -1282,7 +1282,7 @@ class cuda_nvcxx_invocation: except OptionNotSet: # nvc++ can handle not setting targets explicitly pass - + return flags def get_linker_flags(self): @@ -1330,13 +1330,13 @@ class hip_invocation: "-fplugin=" + self._acpp_plugin_path, "-D__ACPP_CLANG__" ] - + for t in self._hip_targets: flags += ["--cuda-gpu-arch=" + t] if not sys.platform.startswith("win32"): flags += ["-fpass-plugin=" + self._acpp_plugin_path] - + return flags def get_linker_flags(self): @@ -1379,7 +1379,7 @@ class omp_invocation: def get_cxx_flags(self): flags = ["-D__ACPP_ENABLE_OMPHOST_TARGET__"] flags += self._cxx_flags - + return flags def get_linker_flags(self): @@ -1523,7 +1523,7 @@ class llvm_sscp_invocation: "-Xclang", "-disable-O0-optnone", "-mllvm", "-acpp-sscp"] sscp_compile_opts = [] - if ("-Ofast" in self._config.forwarded_compiler_arguments or + if ("-Ofast" in self._config.forwarded_compiler_arguments or "-ffast-math" in self._config.forwarded_compiler_arguments): 
sscp_compile_opts.append("fast-math") @@ -1634,7 +1634,7 @@ class compiler: raise RuntimeError("Unknown backend: " + backend) self._backends += self._multipass_backends - + self._host_compiler = self._select_compiler() for mb in self._multipass_backends: mb.set_host_compiler(self._host_compiler) @@ -1647,7 +1647,7 @@ class compiler: self._verify_backend_combinations() - # Take into account extended host pass requirements for + # Take into account extended host pass requirements for # explicit multipass. E.g., CUDA explicit multipass requires # -x cuda or -x hip in the host pass. # The "extended host pass" concept is an abstraction of the fact @@ -1671,7 +1671,7 @@ class compiler: print_error("backend",b, "appears multiple times in processed target specification") reqs = b.get_host_pass_requirements() - + conflicts = reqs['conflicts'] caveats = reqs['caveats'] for c in caveats: @@ -1681,7 +1681,7 @@ class compiler: if c in selected_backends: print_error("requested backends",b.unique_name, "and",c,"are incompatible.") fatal_error = True - + if fatal_error: raise RuntimeError("Errors encountered while verifying combination of requested backends.") @@ -1695,12 +1695,12 @@ class compiler: if host_pass_reqs['requires-extended-host-pass']: extended_pass_providers = host_pass_reqs['extended-host-pass-providers'] - + available_providers = [] for provider in extended_pass_providers: if provider in active_backends: available_providers.append(provider) - + # If there is already an integrated multipass backend running # that already provides the flags, or if an explicit multipass # provider is already enabled, there is nothing to do @@ -1709,7 +1709,7 @@ class compiler: return elif active_backends[p].is_extended_host_pass_enabled: return - + # Otherwise, we need to select and enable an explicit multipass # provider. Currently we always select the backend we are configuring. 
# TODO make this user configurable, especially if we add HIP explicit multipass @@ -1758,12 +1758,12 @@ class compiler: "-mllvm", "-acpp-stdpar", "-include", os.path.join(stdpar_include_path, "detail", "sycl_glue.hpp") ] - + if self._is_stdpar_system_usm: args += ["-mllvm", "-acpp-stdpar-no-malloc-to-usm", "-D__ACPP_STDPAR_ASSUME_SYSTEM_USM__"] if self._is_stdpar_unconditional_offload: args += ["-D__ACPP_STDPAR_UNCONDITIONAL_OFFLOAD__"] - + if self._stdpar_prefetch_mode != None: prefetch_mode_string = self._stdpar_prefetch_mode prefetch_mode_id = 0 @@ -1780,7 +1780,7 @@ class compiler: prefetch_mode_id = 4 else: raise RuntimeError("Invalid value for stdpar-prefetch-mode: "+prefetch_mode_string) - + args += ["-D__ACPP_STDPAR_PREFETCH_MODE__="+str(prefetch_mode_id)] return args + self._common_compiler_args @@ -1791,7 +1791,7 @@ class compiler: "-L"+self._acpp_lib_path, "-lacpp-rt" ] - + if sys.platform == "darwin": linker_args.append("-Wl,-rpath") linker_args.append(self._acpp_lib_path) @@ -1831,7 +1831,7 @@ class compiler: if priority > compiler_priority: compiler_executable = cxx compiler_priority = priority - + return compiler_executable def _flag_should_be_unique(self, flag): @@ -1934,7 +1934,7 @@ if __name__ == '__main__': if sys.version_info[0] < 3: print_error("acpp requires python 3.") sys.exit(-1) - + filename = os.path.basename(os.path.realpath(__file__)) if filename == "syclcc": print_warning("syclcc is deprecated; please use acpp instead.") @@ -1968,7 +1968,7 @@ if __name__ == '__main__': print_warning("No optimization flag was given, optimizations are " "disabled by default. Performance may be degraded. Compile with e.g. 
-O2/-O3 to " "enable optimizations.") - + c = compiler(config) sys.exit(c.run()) except Exception as e: From e6f2dde392daf55c7e07a0b18068ba2818aa6857 Mon Sep 17 00:00:00 2001 From: Arpan <105904949+Arpan3323@users.noreply.github.com> Date: Sun, 22 Sep 2024 03:30:59 -0500 Subject: [PATCH 020/126] Update version.hpp --- include/hipSYCL/sycl/version.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hipSYCL/sycl/version.hpp b/include/hipSYCL/sycl/version.hpp index d329026c9..3f12ad85d 100644 --- a/include/hipSYCL/sycl/version.hpp +++ b/include/hipSYCL/sycl/version.hpp @@ -22,7 +22,7 @@ namespace detail { static std::string version_string() { std::string version = std::to_string(ACPP_VERSION_MAJOR) - + "." + std::to_string(ACPP_VERSION_MINOR) + + "." + "0" + std::to_string(ACPP_VERSION_MINOR) + "." + std::to_string(ACPP_VERSION_PATCH) + std::string(ACPP_VERSION_SUFFIX); From f41964f848982be0d00a28fa631408e26409333a Mon Sep 17 00:00:00 2001 From: Arpan <105904949+Arpan3323@users.noreply.github.com> Date: Mon, 23 Sep 2024 04:09:42 -0500 Subject: [PATCH 021/126] added check for adding a leading zero --- include/hipSYCL/sycl/version.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/hipSYCL/sycl/version.hpp b/include/hipSYCL/sycl/version.hpp index 3f12ad85d..887055395 100644 --- a/include/hipSYCL/sycl/version.hpp +++ b/include/hipSYCL/sycl/version.hpp @@ -21,8 +21,9 @@ namespace detail { static std::string version_string() { + std::string zero = (ACPP_VERSION_MINOR < 10 ? "0" : ""); std::string version = std::to_string(ACPP_VERSION_MAJOR) - + "." + "0" + std::to_string(ACPP_VERSION_MINOR) + + "." + zero + std::to_string(ACPP_VERSION_MINOR) + "." 
+ std::to_string(ACPP_VERSION_PATCH) + std::string(ACPP_VERSION_SUFFIX); From 680d600c6e0a4a24b19061643b694cdae394a5eb Mon Sep 17 00:00:00 2001 From: Marco Julian Solanki Date: Sat, 5 Oct 2024 21:30:05 +0200 Subject: [PATCH 022/126] Rework using-hipsycl.md * Make punctuation, capitalisation (e.g. of "CMake") and formatting a bit more consistent * Update the featured `acpp --help` output to 24.06 * Reword and rephrase some sentences to make them sound a bit more idiomatic --- doc/using-hipsycl.md | 92 ++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/doc/using-hipsycl.md b/doc/using-hipsycl.md index 25020eba6..995d326ee 100644 --- a/doc/using-hipsycl.md +++ b/doc/using-hipsycl.md @@ -1,24 +1,25 @@ # Using AdaptiveCpp in projects -It is recommended to use the CMake integration for larger projects. See the section on the cmake integration below. Alternatively, `acpp` can be used directly as a compiler. -## Using acpp +It is recommended that the CMake integration be used for larger projects (see the section on the CMake integration below). Alternatively, `acpp` can be used directly as a compiler. -`acpp` can be invoked like a regular compiler (e.g. `acpp -O3 -o test test.cpp`). It supports multiple compilation flows. In a typical installation (i.e. when AdaptiveCpp was built against LLVM >= 14 and the generic SSCP compiler was not explicitly disabled), it uses the `generic` compilation flow by default. This compilation flow usually compiles the fastest, produces the fastest binaries, and its generated binaries can run on all supported devices. 
**Unless you have have very specific needs, you probably should use the default `generic` compiler.** +## Using `acpp` -Advanced users or users with more specific needs may want to specify compilation flows explicitly.This is achieved with the `--acpp-targets="compilation-flow1:target1,target2,...;compilation-flow2:..."` command line argument, `ACPP_TARGETS` environment variable or cmake argument. +`acpp` can be invoked like a regular compiler (e.g. `acpp -O3 -o test test.cpp`). It supports multiple compilation flows. A typical installation (i.e. when AdaptiveCpp was built against LLVM >= 14 and the generic SSCP compiler was not explicitly disabled) uses the `generic` compilation flow by default. This compilation flow usually compiles the quickest, produces the fastest binaries, and its generated binaries can run on all supported devices. **Unless you have very specific needs, you should probably use the default `generic` compiler.** -**Other compilation flows like omp, cuda, hip are typically mostly interesting for backend interoperability use cases, not if performance is the top priority.**. +Advanced users or users with more specific needs may want to specify compilation flows explicitly. This is achieved with the `--acpp-targets="compilation-flow1:target1,target2,...;compilation-flow2:..."` command line argument, the `ACPP_TARGETS` environment variable or the `ACPP_TARGETS` CMake variable. + +**Other compilation flows like `omp`, `cuda`, and `hip` are typically mostly attractive for backend interoperability use cases, not when performance is the primary concern.** ## AdaptiveCpp targets specification -Both `acpp` and the cmake integration can be optionally provided with an AdaptiveCpp targets specification. This specification defines which compilation flows AdaptiveCpp should enable, and which devices from a compilation flow AdaptiveCpp should target during compilation. 
In general, it has the form: +Both `acpp` and the CMake integration can optionally be provided with an AdaptiveCpp targets specification. This specification defines which compilation flows AdaptiveCpp should enable and which devices from a compilation flow AdaptiveCpp should target. In general, it has the form: ``` "flow1:target1,target2,...;flow2:...;..." ``` -and can be passed either as `acpp` command line argument (`--acpp-targets=...`), environment variable (`ACPP_TARGETS=...`) or CMake argument (`-DACPP_TARGETS=...`) depending on whether `acpp` or `cmake` is used. +and can be passed either as an `acpp` command line argument (`--acpp-targets=...`), environment variable (`ACPP_TARGETS=...`) or CMake argument (`-DACPP_TARGETS=...`) depending on whether `acpp` or `cmake` is used. -"compilation flow" refers to one of the available compilation flows defined in the [compilation flow](compilation.md) documentation. +"Compilation flow" refers to one of the available compilation flows defined in the [compilation documentation](compilation.md). ### Requirements for specifying targets of individual compilation flows @@ -36,51 +37,51 @@ For the following compilation flows, targets can optionally be specified: For the following compilation flows, targets must be specified: -* `cuda.*` - The target format is defined by clang and takes the format of `sm_XY`. For example: - * `sm_52`: NVIDIA Maxwell GPUs - * `sm_60`: NVIDIA Pascal GPUs - * `sm_70`: NVIDIA Volta GPUs -* `hip.*` - The target format is defined by clang and takes the format of `gfxXYZ`. For example: +* `cuda.*` - The target format is defined by `clang` and takes the format of `sm_XY`. For example: + * `sm_52`: NVIDIA Maxwell GPUs (e.g. GeForce GTX 980, TITAN X) + * `sm_61`: NVIDIA Pascal GPUs (e.g. GeForce GTX 1080, TITAN Xp) + * `sm_70`: NVIDIA Volta GPUs (e.g. Tesla V100, TITAN V) +* `hip.*` - The target format is defined by `clang` and takes the format of `gfxXYZ`. 
For example: * `gfx900`: AMD Vega 10 GPUs (e.g. Radeon Vega 56, Vega 64) * `gfx906`: AMD Vega 20 GPUs (e.g. Radeon VII, Instinct MI50) - * `gfx908`: AMD CDNA GPUs (e.g Instinct MI100) + * `gfx908`: AMD CDNA GPUs (e.g. Instinct MI100) ### Abbreviations For some compilation flows, abbreviations exist that will be resolved by AdaptiveCpp to one of the available compilation flows: -* `omp` will be translated - * into `omp.accelerated` - * if AdaptiveCpp has been built with support for accelerated CPU and the host compiler is the clang that AdaptiveCpp has been built with or +* `omp` will be translated + * into `omp.accelerated` + * if AdaptiveCpp has been built with support for accelerated CPU and the host compiler is the `clang` that AdaptiveCpp has been built with or * if `--acpp-use-accelerated-cpu` is set. If the accelerated CPU compilation flow is not available (e.g. AdaptiveCpp has been compiled without support for it), compilation will abort with an error. - * into `omp.library-only` otherwise + * into `omp.library-only` otherwise. * `cuda` will be translated * into `cuda.explicit-multipass` - * if another integrated multipass has been requested, or another backend that would conflict with `cuda.integrated-multipass`. AdaptiveCpp will emit a warning in this case, since switching to explicit multipass can change interoperability guarantees (see the [compilation](compilation.md) documentation). - * if `--acpp-explicit-multipass` is set explicitly - * into `cuda.integrated-multipass` otherwise -* `hip` will be translated into `hip.integrated-multipass` + * if another integrated multipass has been requested, or another backend that would conflict with `cuda.integrated-multipass`. AdaptiveCpp will emit a warning in this case, since switching to explicit multipass can change interoperability guarantees (see the [compilation documentation](compilation.md)). + * if `--acpp-explicit-multipass` is set explicitly. + * into `cuda.integrated-multipass` otherwise. 
+* `hip` will be translated into `hip.integrated-multipass`. Of course, the desired flows can also always be specified explicitly. ### Examples -* `generic` - creates a binary that can run on all backends. This also typically creates the fastest binaries. -* `"omp.library-only;cuda.explicit-multipass:sm_61;sm_70"` - compiles for the CPU backend and Pascal and Volta era GPUs -* `"omp;cuda:sm_70;hip:gfx906"` - compiles for the CPU backend (library or accelerated), NVIDIA Volta era GPUs via explicit multipass, AMD Vega 20 GPUs -* `"omp.accelerated;cuda:sm_70`" - compiles for the CPU backend (compiler accelerated) and NVIDIA Volta era GPUs. -* `"omp;cuda-nvcxx"` - compiles for the CPU backend and NVIDIA GPUs using nvc++ +* `"generic"` - creates a binary that can run on all backends. This also typically creates the fastest binaries. +* `"omp.library-only;cuda.explicit-multipass:sm_61;sm_70"` - compiles for the CPU backend and Pascal- and Volta-era GPUs. +* `"omp;cuda:sm_70;hip:gfx906"` - compiles for the CPU backend (library or accelerated), NVIDIA Volta-era GPUs via explicit multipass and AMD Vega 20 GPUs. +* `"omp.accelerated;cuda:sm_70"` - compiles for the CPU backend (compiler-accelerated) and NVIDIA Volta-era GPUs. +* `"omp;cuda-nvcxx"` - compiles for the CPU backend and NVIDIA GPUs using `nvc++`. ### Offloading C++ standard parallelism See [here](stdpar.md) for details on how to offload C++ standard STL algorithms using AdaptiveCpp. -## All the flags: acpp --help +## All the flags: `acpp --help` -The full excerpt from `acpp --help` follows below. Note the options can also be set via environment variables or corresponding CMake options. Default values can be set in the `/acpp/install/path/etc/AdaptiveCpp/*.json` files. +The full output obtained when running `acpp --help` is provided below. Note that the options can also be set via environment variables or the corresponding CMake options. 
Default values can be set in the `/acpp/install/path/etc/AdaptiveCpp/*.json` files. ``` -acpp [AdaptiveCpp compilation driver], Copyright (C) 2018-2023 Aksel Alpay and the AdaptiveCpp project - AdaptiveCpp version: 23.10.0+git.2d0c6b6f.20240226.branch.develop +acpp [AdaptiveCpp compilation driver], Copyright (C) 2018-2024 Aksel Alpay and the AdaptiveCpp project + AdaptiveCpp version: 24.06.0+git.8cf7a902.20241001.branch.develop Installation root: /install/path Plugin LLVM version: , can accelerate CPU: Available runtime backends: @@ -96,7 +97,7 @@ Options are: [can also be set with environment variable: ACPP_PLATFORM=] [default value provided by field 'default-platform' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].] [current value: NOT SET] - (deprecated) The platform that hipSYCL should target. Valid values: + (deprecated) The platform that AdaptiveCpp should target. Valid values: * cuda: Target NVIDIA CUDA GPUs * rocm: Target AMD GPUs running on the ROCm platform * cpu: Target only CPUs @@ -106,8 +107,8 @@ Options are: [default value provided by field 'default-clang' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].] [current value: NOT SET] The path to the clang executable that should be used for compilation - (Note: *must* be compatible with the clang version that the - hipSYCL clang plugin was compiled against!) + (Note: *must* be compatible with the clang version that the + AdaptiveCpp clang plugin was compiled against!) --acpp-nvcxx= [can also be set with environment variable: ACPP_NVCXX=] @@ -201,7 +202,7 @@ Options are: [default value provided by field 'default-config-file-dir' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].] [current value: NOT SET] Select an alternative path for the config files containing the default AdaptiveCpp settings. - It is normally not necessary for the user to change this setting. + It is normally not necessary for the user to change this setting. 
--acpp-targets= [can also be set with environment variable: ACPP_TARGETS=] @@ -215,11 +216,11 @@ Options are: Uses Boost.Fiber for nd_range parallel_for support. - omp.accelerated: Uses clang as host compiler to enable compiler support for nd_range parallel_for (see --acpp-use-accelerated-cpu). - * cuda - CUDA backend + * cuda - CUDA backend Requires specification of targets of the form sm_XY, e.g. sm_70 for Volta, sm_60 for Pascal Backend Flavors: - - cuda.explicit-multipass: CUDA backend in explicit multipass mode + - cuda.explicit-multipass: CUDA backend in explicit multipass mode (see --acpp-explicit-multipass) - cuda.integrated-multipass: Force CUDA backend to operate in integrated multipass mode. @@ -263,8 +264,8 @@ Options are: [can also be set by setting environment variable ACPP_DRYRUN to any value other than false|off|0 ] [default value provided by field 'default-is-dryrun' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].] [current value: NOT SET] - If set, only shows compilation commands that would be executed, - but does not actually execute it. + If set, only shows compilation commands that would be executed, + but does not actually execute it. --acpp-explicit-multipass [can also be set by setting environment variable ACPP_EXPLICIT_MULTIPASS to any value other than false|off|0 ] @@ -273,7 +274,7 @@ Options are: If set, executes device passes as separate compiler invocation and lets AdaptiveCpp control embedding device images into the host binary. This allows targeting multiple backends simultaneously that might otherwise be incompatible. In this mode, source code level interoperability may not be supported in the host pass. - For example, you cannot use the CUDA kernel launch syntax[i.e. kernel <<< ... >>> (...)] in this mode. + For example, you cannot use the CUDA kernel launch syntax[i.e. kernel <<< ... >>> (...)] in this mode. 
--acpp-save-temps [can also be set by setting environment variable ACPP_SAVE_TEMPS to any value other than false|off|0 ] @@ -313,15 +314,14 @@ Options are: Any other options will be forwarded to the compiler. Note: Command line arguments take precedence over environment variables. - ``` ## Using the CMake integration -Setting up a project using the AdaptiveCpp CMake integration is quite straight forward. -The main points are adding `find_package(AdaptiveCpp REQUIRED)` and after defining the targets to build, adding `add_sycl_to_target(TARGET )` to have the compilation handled by the AdaptiveCpp toolchain. -See the [example cmake project](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/examples/CMakeLists.txt). -A typical configure command line looks like this: `cmake .. -DAdaptiveCpp_DIR=/acpp/install/dir/lib/cmake/AdaptiveCpp -DACPP_TARGETS=""`. -`ACPP_TARGETS` has to be set either as environment variable or on the command line for the `find_package` call to succeed. See the documentation of this flag above. +Setting up a project using the AdaptiveCpp CMake integration is fairly straightforward. +The main points are adding `find_package(AdaptiveCpp REQUIRED)` and, after defining the targets to build, adding `add_sycl_to_target(TARGET )` to have the compilation handled by the AdaptiveCpp toolchain (see the [example CMake project](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/examples/CMakeLists.txt)). + +A typical configure command might look like this: `cmake .. -DAdaptiveCpp_DIR=/acpp/install/dir/lib/cmake/AdaptiveCpp -DACPP_TARGETS=""`. +`ACPP_TARGETS` has to be set either as an environment variable or through the command line for the `find_package` call to succeed. See the documentation of this flag above. If the accelerated CPU flow has been built, `-DACPP_USE_ACCELERATED_CPU=ON/OFF` can be used to override whether `omp` should refer to the `omp.library-only` or `omp.accelerated` compilation flow. 
From 76aa2f18a40ac5f8ca22278d88856320be18a43b Mon Sep 17 00:00:00 2001 From: Arpan <105904949+Arpan3323@users.noreply.github.com> Date: Tue, 8 Oct 2024 18:34:58 +0000 Subject: [PATCH 023/126] Update vendor and device names with "AdaptiveCpp" --- include/hipSYCL/sycl/platform.hpp | 2 +- src/runtime/omp/omp_hardware_manager.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/hipSYCL/sycl/platform.hpp b/include/hipSYCL/sycl/platform.hpp index 934b2b8fb..9d2046862 100644 --- a/include/hipSYCL/sycl/platform.hpp +++ b/include/hipSYCL/sycl/platform.hpp @@ -152,7 +152,7 @@ HIPSYCL_SPECIALIZE_GET_INFO(platform, name) HIPSYCL_SPECIALIZE_GET_INFO(platform, vendor) { - return "The hipSYCL project"; + return "The AdaptiveCpp project"; } HIPSYCL_SPECIALIZE_GET_INFO(platform, extensions) diff --git a/src/runtime/omp/omp_hardware_manager.cpp b/src/runtime/omp/omp_hardware_manager.cpp index 0adb67ce9..a1d056c73 100644 --- a/src/runtime/omp/omp_hardware_manager.cpp +++ b/src/runtime/omp/omp_hardware_manager.cpp @@ -37,11 +37,11 @@ std::size_t omp_hardware_context::get_max_memcpy_concurrency() const { } std::string omp_hardware_context::get_device_name() const { - return "hipSYCL OpenMP host device"; + return "AdaptiveCpp OpenMP host device"; } std::string omp_hardware_context::get_vendor_name() const { - return "the hipSYCL project"; + return "the AdaptiveCpp project"; } std::string omp_hardware_context::get_device_arch() const { From 154f50ff00e5167b95de8c5d7907cceb7e58484e Mon Sep 17 00:00:00 2001 From: Marco Julian Solanki Date: Wed, 9 Oct 2024 13:00:05 +0200 Subject: [PATCH 024/126] Rework index.md * Fix copy/paste error ("Rust Logo" -> "AdaptiveCpp Logo") * Fix some dubious punctuation * Fix some false advertising (acpp can run on "any" -> "most" hardware) --- doc/index.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/index.md b/doc/index.md index 4dd1f7558..dcbbd746b 100644 --- a/doc/index.md +++ 
b/doc/index.md @@ -1,7 +1,7 @@ # -![The Rust Logo](img/logo/logo-color.png) +![The AdaptiveCpp Logo](img/logo/logo-color.png) -Welcome to the documentation of the AdaptiveCpp ! +Welcome to the documentation of AdaptiveCpp!
@@ -9,16 +9,15 @@ Welcome to the documentation of the AdaptiveCpp ! --- - Configure and install AdaptiveCpp + Configure and install AdaptiveCpp. [:octicons-arrow-right-24: Installation](./installing.md) -- :fontawesome-solid-gears:{ .lg .middle } __Can run on any hardware__ +- :fontawesome-solid-gears:{ .lg .middle } __Can run on most hardware__ --- - We support CPUs, GPUs, from all vendors, either through multipass compilation. - Or through our single pass SSCP compiler + We support CPUs and GPUs from all major vendors, either through multipass compilation or through our single-pass SSCP compiler. [:octicons-arrow-right-24: Usage](./using-hipsycl.md) @@ -27,4 +26,4 @@ Welcome to the documentation of the AdaptiveCpp ! !!! note - This documentation webpage is WIP. \ No newline at end of file + This documentation webpage is still work-in-progress. From de2836ed4861e2683b9b06d9cf640d316ef9512f Mon Sep 17 00:00:00 2001 From: Marco Julian Solanki <173357676+marcosolanki@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:40:05 +0200 Subject: [PATCH 025/126] Clarify "most hardware" to "hardware from all major vendors" in index.md Co-authored-by: Aksel Alpay --- doc/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/index.md b/doc/index.md index dcbbd746b..8d622d306 100644 --- a/doc/index.md +++ b/doc/index.md @@ -13,7 +13,7 @@ Welcome to the documentation of AdaptiveCpp! 
[:octicons-arrow-right-24: Installation](./installing.md) -- :fontawesome-solid-gears:{ .lg .middle } __Can run on most hardware__ +- :fontawesome-solid-gears:{ .lg .middle } __Can run on hardware from all major vendors__ --- From 7bfce4b7094aeb50d0d834fce29839a62b1724b7 Mon Sep 17 00:00:00 2001 From: Marco Julian Solanki Date: Sun, 13 Oct 2024 15:00:05 +0200 Subject: [PATCH 026/126] Rename using-hipsycl.md to using-acpp.md --- README.md | 2 +- doc/cleanup_syclcchelp.sh | 2 +- doc/compilation.md | 2 +- doc/index.md | 2 +- doc/{using-hipsycl.md => using-acpp.md} | 0 mkdocs.yml | 4 ++-- 6 files changed, 6 insertions(+), 6 deletions(-) rename doc/{using-hipsycl.md => using-acpp.md} (100%) diff --git a/README.md b/README.md index 9b2168c6a..4947bc70a 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ In order to compile software with AdaptiveCpp, use `acpp`. `acpp` can be used li `acpp` accepts both command line arguments and environment variables to configure its behavior (e.g., to select the target to compile for). See `acpp --help` for a comprehensive list of options. -For details and instructions on using AdaptiveCpp in CMake projects, please see the documentation on [using AdaptiveCpp](doc/using-hipsycl.md). +For details and instructions on using AdaptiveCpp in CMake projects, please see the documentation on [using AdaptiveCpp](doc/using-acpp.md). ## About the project diff --git a/doc/cleanup_syclcchelp.sh b/doc/cleanup_syclcchelp.sh index a08e296c9..6992c7092 100755 --- a/doc/cleanup_syclcchelp.sh +++ b/doc/cleanup_syclcchelp.sh @@ -1,7 +1,7 @@ #! 
/bin/bash # usage: acpp --help | ./cleanup_syclcchelp.sh -# output is in sylccout then and should be copied into using-hipsycl.md +# output is in acppout then and should be copied into using-acpp.md sed "s/\[current value: .*\]/[current value: NOT SET]/g" > acppout diff --git a/doc/compilation.md b/doc/compilation.md index 23507c975..77f0e7e65 100644 --- a/doc/compilation.md +++ b/doc/compilation.md @@ -115,7 +115,7 @@ approach is employed to achieve good performance and functional correctness (_Ka A deep dive into how the implementation works and why this approach was chosen can be found in Joachim Meyer's [master thesis](https://joameyer.de/hipsycl/Thesis_JoachimMeyer.pdf). -For more details, see the [installation instructions](installing.md) and the documentation [using AdaptiveCpp](using-hipsycl.md). +For more details, see the [installation instructions](installing.md) and the documentation [using AdaptiveCpp](using-acpp.md). ## acpp compilation driver diff --git a/doc/index.md b/doc/index.md index 8d622d306..52ce492e5 100644 --- a/doc/index.md +++ b/doc/index.md @@ -19,7 +19,7 @@ Welcome to the documentation of AdaptiveCpp! We support CPUs and GPUs from all major vendors, either through multipass compilation or through our single-pass SSCP compiler. - [:octicons-arrow-right-24: Usage](./using-hipsycl.md) + [:octicons-arrow-right-24: Usage](./using-acpp.md)
diff --git a/doc/using-hipsycl.md b/doc/using-acpp.md similarity index 100% rename from doc/using-hipsycl.md rename to doc/using-acpp.md diff --git a/mkdocs.yml b/mkdocs.yml index 915b75fa0..fbfc2b3b6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,7 +10,7 @@ nav: - 'Usage' : - - 'Using AdaptiveCpp' : 'using-hipsycl.md' + - 'Using AdaptiveCpp' : 'using-acpp.md' - 'Compilation model' : 'compilation.md' - 'Env variables' : 'env_variables.md' - 'Performance guide' : 'performance.md' @@ -73,4 +73,4 @@ markdown_extensions: - admonition - pymdownx.details - - pymdownx.superfences \ No newline at end of file + - pymdownx.superfences From 2950f49e3dab28a1c58f76f185a9393503b19d9b Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 15 Oct 2024 23:24:48 +0200 Subject: [PATCH 027/126] [stdpar] Add initial merge support via merge path algorithm --- include/hipSYCL/algorithms/algorithm.hpp | 32 ++- .../algorithms/binary_search/index_search.hpp | 74 +++++++ include/hipSYCL/algorithms/merge/merge.hpp | 130 +++++++++++++ .../hipSYCL/algorithms/merge/merge_path.hpp | 184 ++++++++++++++++++ .../stdpar/detail/offload_heuristic_db.hpp | 1 + .../std/stdpar/pstl-impl/algorithm.hpp | 139 +++++++++++++ tests/CMakeLists.txt | 1 + tests/pstl/merge.cpp | 146 ++++++++++++++ tests/pstl/pstl_test_suite.hpp | 4 + 9 files changed, 709 insertions(+), 2 deletions(-) create mode 100644 include/hipSYCL/algorithms/binary_search/index_search.hpp create mode 100644 include/hipSYCL/algorithms/merge/merge.hpp create mode 100644 include/hipSYCL/algorithms/merge/merge_path.hpp create mode 100644 tests/pstl/merge.cpp diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index 1f6633dcd..88148e1a9 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ b/include/hipSYCL/algorithms/algorithm.hpp @@ -21,10 +21,12 @@ #include "hipSYCL/sycl/libkernel/functional.hpp" #include "hipSYCL/sycl/event.hpp" #include "hipSYCL/sycl/queue.hpp" +#include "merge/merge.hpp" 
#include "util/traits.hpp" #include "hipSYCL/algorithms/util/allocation_cache.hpp" #include "hipSYCL/algorithms/util/memory_streaming.hpp" #include "hipSYCL/algorithms/sort/bitonic_sort.hpp" +#include "hipSYCL/algorithms/merge/merge.hpp" namespace hipsycl::algorithms { @@ -454,14 +456,40 @@ sycl::event none_of(sycl::queue &q, } template -void sort(sycl::queue &q, RandomIt first, RandomIt last, - Compare comp = std::less<>{}) { +sycl::event sort(sycl::queue &q, RandomIt first, RandomIt last, + Compare comp = std::less<>{}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; return sorting::bitonic_sort(q, first, last, comp); } + +template< class ForwardIt1, class ForwardIt2, + class ForwardIt3, class Compare > +sycl::event merge(sycl::queue& q, + util::allocation_group &scratch_allocations, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first, Compare comp = std::less<>{}, + const std::vector& deps = {}) { + + std::size_t size1 = std::distance(first1, last1); + std::size_t size2 = std::distance(first2, last2); + + if(size1 == 0) + return copy(q, first2, last2, d_first); + if(size2 == 0) + return copy(q, first1, last1, d_first); + + std::size_t problem_size = size1 + size2; + if(problem_size == 0) + return sycl::event{}; + + return merging::segmented_merge(q, first1, last1, first2, last2, d_first, + comp); +} + } #endif diff --git a/include/hipSYCL/algorithms/binary_search/index_search.hpp b/include/hipSYCL/algorithms/binary_search/index_search.hpp new file mode 100644 index 000000000..701278f73 --- /dev/null +++ b/include/hipSYCL/algorithms/binary_search/index_search.hpp @@ -0,0 +1,74 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. 
+ * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_INDEX_SEARCH_HPP +#define ACPP_ALGORITHMS_INDEX_SEARCH_HPP + +#include + +namespace hipsycl::algorithms::binary_searching { + +// Same as std::lower_bound, but works in terms of indices +template< class IndexT, class T, class DataGetter, + class Compare > +constexpr IndexT index_lower_bound( IndexT first, IndexT last, + const T& value, DataGetter load, Compare comp ) { + using SignedIndexT = typename std::make_signed::type; + + IndexT current; + SignedIndexT count, step; + count = last - first; + + while (count > 0) { + current = first; + step = count / 2; + current += step; + + if (comp(load(current), value)) { + first = ++current; + count -= step + 1; + } else + count = step; + } + + return first; +} + + +// Same as std::upper_bound, but works in terms of indices +template< class IndexT, class T, class DataGetter, + class Compare > +constexpr IndexT index_upper_bound( IndexT first, IndexT last, + const T& value, DataGetter load, Compare comp ) { + using SignedIndexT = typename std::make_signed::type; + + IndexT current; + SignedIndexT count, step; + count = last - first; + + while (count > 0) { + current = first; + step = count / 2; + current += step; + + if (!comp(value, load(current))) { + first = ++current; + count -= step + 1; + } else + count = step; + } + + return first; +} + +} + +#endif diff --git a/include/hipSYCL/algorithms/merge/merge.hpp b/include/hipSYCL/algorithms/merge/merge.hpp new file mode 100644 index 000000000..f8e58e0c3 --- /dev/null +++ b/include/hipSYCL/algorithms/merge/merge.hpp @@ -0,0 +1,130 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_MERGE_HPP +#define ACPP_ALGORITHMS_MERGE_HPP + +#include +#include +#include + +#include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +#include "merge_path.hpp" + +namespace hipsycl::algorithms::merging { + +namespace detail { + +template +void sequential_merge(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2, OutputIt out, Compare comp, Size max_num_merged) { + + auto initial_out = out; + auto copy_remaining = [&](auto first, auto last) { + for (; first != last && (std::distance(initial_out, out) < max_num_merged); + ++first, ++out) + *out = *first; + }; + + for (; first1 != last1 && (std::distance(initial_out, out) < max_num_merged); + ++out) { + if(first2 == last2) { + copy_remaining(first1, last1); + return; + } else { + auto f1 = *first1; + auto f2 = *first2; + if(comp(f1, f2)) { + *out = f1; + ++first1; + } else { + *out = f2; + ++first2; + } + } + } + copy_remaining(first2, last2); +} + + +/// Decomposes the problem into N independent merges of given size, and +/// then runs sequential merge on them. This might be a good strategy on CPU. +/// +/// Precondition: distance(fist1, last1) > 0 && distance(first2, last2) > 0. +/// Otherwise we cannot run the merge path algorithm for decomposing the merge. 
+template +void segmented_merge( + RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, RandomIt2 last2, + OutputIt out, Compare comp, std::size_t partition_index, + std::size_t partition_chunk_size) { + + std::size_t p1 = 0; + std::size_t p2 = 0; + + merge_path::nth_independent_merge_begin(first1, last1, first2, last2, comp, + partition_index, + partition_chunk_size, p1, p2); + + auto chunk_first1 = first1; + auto chunk_first2 = first2; + + std::advance(chunk_first1, p1); + std::advance(chunk_first2, p2); + + auto chunk_last1 = chunk_first1; + auto chunk_last2 = chunk_first2; + + std::advance(chunk_last1, std::min(partition_chunk_size, + std::distance(first1, last1) - p1)); + std::advance(chunk_last2, std::min(partition_chunk_size, + std::distance(first2, last2) - p2)); + + std::size_t chunk_out_offset = partition_index * partition_chunk_size; + auto chunk_out = out; + std::advance(chunk_out, chunk_out_offset); + + sequential_merge(chunk_first1, chunk_last1, chunk_first2, chunk_last2, + chunk_out, comp, partition_chunk_size); +} + +} + +/// Precondition: distance(fist1, last1) > 0 && distance(first2, last2) > 0. +/// Otherwise we cannot run the merge path algorithm for decomposing the merge. 
+template +sycl::event segmented_merge(sycl::queue &q, RandomIt1 first1, RandomIt1 last1, + RandomIt2 first2, RandomIt2 last2, OutputIt out, + Compare comp, + std::size_t partition_chunk_size = 128, + const std::vector &deps = {}) { + + //detail::print_merge_matrix(first1, last1, first2, last2, comp); + + std::size_t problem_size = merge_path::num_partitions( + first1, last1, first2, last2, partition_chunk_size); + + if(problem_size == 0) + return sycl::event{}; + + return q.parallel_for(sycl::range{problem_size}, deps, [=](sycl::id<1> idx) { + detail::segmented_merge(first1, last1, first2, last2, out, comp, idx.get(0), + partition_chunk_size); + }); +} +} + + + + + +#endif diff --git a/include/hipSYCL/algorithms/merge/merge_path.hpp b/include/hipSYCL/algorithms/merge/merge_path.hpp new file mode 100644 index 000000000..ac5ffdefe --- /dev/null +++ b/include/hipSYCL/algorithms/merge/merge_path.hpp @@ -0,0 +1,184 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_MERGE_PATH_HPP +#define ACPP_ALGORITHMS_MERGE_PATH_HPP + +#include +#include +#include + +#include "../binary_search/index_search.hpp" + +namespace hipsycl::algorithms::merging { + + +/// This implements the merge path algorithm, which can be used to decompose a merge +/// into N disjoint, independent merges which can be run in parallel. For details, see +/// Green et al. 
(2014): Merge Path - A Visually Intuitive Approach to Parallel Merging +/// https://arxiv.org/pdf/1406.2628 +class merge_path { +public: + template + static void + nth_independent_merge_begin(ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, Compare comp, + Size partition_index, Size partition_chunk_size, + Size &array1_pos_out, Size &array2_pos_out) { + + Size input1_size = static_cast(std::distance(first1, last1)); + Size input2_size = static_cast(std::distance(first2, last2)); + + binary_diag_search(first1, last1, first2, last2, comp, input1_size, + input2_size, partition_index * partition_chunk_size, array1_pos_out, + array2_pos_out); + } + + template + static constexpr Size + num_partitions(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2, Size partition_chunk_size) { + Size input1_size = static_cast(std::distance(first1, last1)); + Size input2_size = static_cast(std::distance(first2, last2)); + + auto num_diags = total_num_diags(input1_size, input2_size); + + return (num_diags + partition_chunk_size - 1) / partition_chunk_size; + } + +private: + template + static auto load(ForwardIt first, Size idx) { + std::advance(first, idx); + return *first; + } + + template + static void store(ForwardIt first, Size idx, const T& val) { + std::advance(first, idx); + *first = val; + } + + + // Total number of left-bottom-top-right diagonals of the AB matrix + template + static constexpr Size total_num_diags(Size size1, Size size2) { + // There are size1 + size2 - 1 "real" diags, but we need an additional diagonal 0 before + // the actual data + return size1 + size2; + } + + template + static void + binary_diag_search(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2, Compare comp, + Size size1, Size size2, + Size diag_index, Size &array1_index_out, + Size &array2_index_out) { + + if(size1 <= 1 && size2 <= 1) { + array1_index_out = 0; + array2_index_out = 0; + return; + } + + Size dlen = 
diag_length(size1, size2, diag_index); + + + if(dlen <= 1) { + array1_index_out = 0; + array2_index_out = 0; + return; + } + + // The idea behind the merge path algorithm is to create the merge matrix, where the + // [i][j] entries are 1 exactly if comp(first1[i],first2[j]) == true, and 0 + // otherwise. This matrix will have a contiguous region of zeroes at the + // top, the rest will be 1. We can then find the merge path by finding the + // highest value where the cross-diagonals in the matrix are 1 using binary + // search. Since we only ever care about the merge matrix when binary + // searching on the diagonal, this function generates entries from the merge + // matrix on-the-fly with just one parameter: the current position on the + // diagonal. + auto data_loader = [&](auto idx) { + auto idx1 = array1_idx_from_diag(size1, size2, diag_index, idx); + auto idx2 = array2_idx_from_diag(size1, size2, diag_index, idx); + + // Due to arcane reasons that cannot be expressed in mere mortal words, + // the merge matrix needs to be shifted by -1 in the first dimension. + // This was revealed to my in a dream. + auto v1 = load(first1, idx1 == 0 ? 0 : idx1 - 1); + auto v2 = load(first2, idx2); + + bool res = comp(v1, v2); + return static_cast(res); + }; + + auto compare = [&](int v1, int v2) { + // Note: Do NOT use comp() here, since this is used to compare entries + // in the merge matrix (which can only be 1 or 0 as generated by data_loader), + // not used to compare elements of the user data array! 
+ return v1 < v2; + }; + + // Run binary serach across the index space [0, dlen) to find the first 1 + // from top to bottom on the current diagonal + auto idx = binary_searching::index_upper_bound(Size{0}, dlen, 0, + data_loader, compare); + + array1_index_out = array1_idx_from_diag(size1, size2, diag_index, idx); + array2_index_out = array2_idx_from_diag(size1, size2, diag_index, idx); + } + + template + static constexpr Size diag_length(Size size1, Size size2, Size diag_idx) { + if(diag_idx < size1 && diag_idx < size2) + return diag_idx; + + auto min_size = std::min(size1, size2); + auto max_size = std::max(size1, size2); + + if(diag_idx >= min_size && diag_idx <= max_size) + return min_size; + + return total_num_diags(size1, size2) - diag_idx; + } + + // position on diag is incremented from the top of the matrix to the bottom. + template + static constexpr Size array1_idx_from_diag(Size size1, Size size2, + Size diag_idx, + Size position_on_diag) { + // Note: We need to use size and *not* size-1 in this expression. + // The position must be able to become invalid so that we can express when + // we only need elements from array2 for the merge and we have left array1. 
+ auto diag_start = std::min(diag_idx, size1); + return diag_start - position_on_diag; + } + + template + static constexpr Size array2_idx_from_diag(Size size1, Size size2, + Size diag_idx, + Size position_on_diag) { + if(diag_idx <= size1) + return position_on_diag; + return diag_idx - size1 + position_on_diag; + } +}; + + + + +} + + + +#endif diff --git a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp index 9c65ae2e6..2b8331ce4 100644 --- a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp +++ b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp @@ -51,6 +51,7 @@ struct all_of {}; struct any_of {}; struct none_of {}; struct sort {}; +struct merge {}; struct transform_reduce {}; diff --git a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp index 9e59c815d..e4c3f611b 100644 --- a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp @@ -527,6 +527,74 @@ HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par_unseq, RandomIt first, } +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par_unseq, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first, Compare comp) { + auto offloader = [&](auto &queue) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first, comp); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_unseq_host_fallback, first1, last1, + first2, last2, d_first, comp); + }; + + HIPSYCL_STDPAR_OFFLOAD( + 
hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first, comp); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par_unseq, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first) { + auto offloader = [&](auto &queue) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_unseq_host_fallback, first1, last1, + first2, last2, d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first); +} + //////////////////// par policy ///////////////////////////////////// @@ -1027,6 +1095,77 @@ HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par, RandomIt first, std::distance(first, last), offloader, fallback, first, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), comp); } + + + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first, Compare comp) { + auto offloader = [&](auto &queue) { + auto scratch_group = + 
hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first, comp); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_unseq_host_fallback, first1, last1, + first2, last2, d_first, comp); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first, comp); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first) { + auto offloader = [&](auto &queue) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_host_fallback, first1, last1, + first2, last2, d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first); +} + } #endif diff --git a/tests/CMakeLists.txt 
b/tests/CMakeLists.txt index cbcbd1771..b813c47c0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -141,6 +141,7 @@ if(WITH_PSTL_TESTS) pstl/generate.cpp pstl/generate_n.cpp pstl/memory.cpp + pstl/merge.cpp pstl/none_of.cpp pstl/reduce.cpp pstl/replace.cpp diff --git a/tests/pstl/merge.cpp b/tests/pstl/merge.cpp new file mode 100644 index 000000000..fd741946a --- /dev/null +++ b/tests/pstl/merge.cpp @@ -0,0 +1,146 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_merge, enable_unified_shared_memory) + +template > +void test_merge(Policy &&pol, std::size_t size1, std::size_t size2, + Generator1 gen1, Generator2 gen2, Comp comp = {}) { + std::vector data1(size1); + std::vector data2(size2); + std::vector out(size1+size2); + + for(int i = 0; i < size1; ++i) + data1[i] = gen1(i); + for(int i = 0; i < size2; ++i) + data2[i] = gen2(i); + std::vector host_out = out; + + auto ret = std::merge(pol, data1.begin(), data1.end(), data2.begin(), data2.end(), + out.begin(), comp); + auto host_ret = std::merge(data1.begin(), data1.end(), data2.begin(), data2.end(), + host_out.begin(), comp); + + BOOST_CHECK(host_out == out); + BOOST_CHECK(ret == out.begin() + std::distance(host_out.begin(), host_ret)); + + for(int i = 0; i < out.size(); ++i) { + auto expected = host_out[i]; + auto result = out[i]; + if(result != expected) + std::cout << i << ": " << expected << " != " << result << std::endl; + } +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + test_merge( + std::execution::par_unseq, 0, 0, [](int i) { 
return 0; }, + [](int i) { return 0; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + test_merge( + std::execution::par_unseq, 1, 0, [](int i) { return 2; }, + [](int i) { return 3; }); + test_merge( + std::execution::par_unseq, 0, 1, [](int i) { return 2; }, + [](int i) { return 3; }); + test_merge( + std::execution::par_unseq, 1, 1, [](int i) { return 2; }, + [](int i) { return 3; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_trivial_merge) { + + auto a = [](int i) { return i; }; + auto b = [](int i) { return i + 1024; }; + + test_merge( + std::execution::par_unseq, 1024, 1024, a, b); + + test_merge( + std::execution::par_unseq, 1024, 1024, b, a); +} + +std::vector generate_sorted_random_numbers(int amount, int seed=123) { + std::mt19937 gen(seed); + std::uniform_int_distribution dist; + + std::vector data; + for(int i = 0; i < amount; ++i) + data.push_back(dist(gen)); + std::sort(data.begin(), data.end()); + return data; +} + +BOOST_AUTO_TEST_CASE(par_unseq_same_size) { + std::size_t s1 = 256; + std::size_t s2 = 256; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 42); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_same_data) { + std::size_t s1 = 1024; + std::size_t s2 = 1024; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 123); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_v1_larger) { + std::size_t s1 = 1932; + std::size_t s2 = 1000; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 42); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_v2_larger) { + std::size_t s1 = 1000; + 
std::size_t s2 = 1932; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 42); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/pstl/pstl_test_suite.hpp b/tests/pstl/pstl_test_suite.hpp index 322696498..dcbfa1975 100644 --- a/tests/pstl/pstl_test_suite.hpp +++ b/tests/pstl/pstl_test_suite.hpp @@ -15,11 +15,15 @@ struct enable_unified_shared_memory { enable_unified_shared_memory() { +#ifndef __ACPP_STDPAR_ASSUME_SYSTEM_USM__ hipsycl::stdpar::unified_shared_memory::pop_disabled(); +#endif } ~enable_unified_shared_memory() { +#ifndef __ACPP_STDPAR_ASSUME_SYSTEM_USM__ hipsycl::stdpar::unified_shared_memory::push_disabled(); +#endif } }; From c64a0018eddcb85d978360ed512ec671940dfa30 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 24 Oct 2024 15:50:11 +0200 Subject: [PATCH 028/126] [stdpar] Add hierarchical hybrid merge --- doc/stdpar.md | 1 + include/hipSYCL/algorithms/algorithm.hpp | 7 +- include/hipSYCL/algorithms/merge/merge.hpp | 207 +++++++++++++++++- .../hipSYCL/algorithms/merge/merge_path.hpp | 6 +- .../hipSYCL/algorithms/sort/bitonic_sort.hpp | 28 +++ 5 files changed, 232 insertions(+), 17 deletions(-) diff --git a/doc/stdpar.md b/doc/stdpar.md index a54683be4..4ff93e9fd 100644 --- a/doc/stdpar.md +++ b/doc/stdpar.md @@ -42,6 +42,7 @@ Offloading is implemented for the following STL algorithms: |`any_of` | | |`all_of` | | |`none_of` | | +|`merge` | | |`sort` | | diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index 88148e1a9..1e5509000 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ b/include/hipSYCL/algorithms/algorithm.hpp @@ -486,8 +486,11 @@ sycl::event merge(sycl::queue& q, if(problem_size == 0) return sycl::event{}; - return merging::segmented_merge(q, first1, last1, first2, last2, d_first, - comp); + //return 
merging::segmented_merge(q, first1, last1, first2, last2, d_first, + // comp); + + return merging::hierarchical_hybrid_merge(q, scratch_allocations, first1, + last1, first2, last2, d_first, comp); } } diff --git a/include/hipSYCL/algorithms/merge/merge.hpp b/include/hipSYCL/algorithms/merge/merge.hpp index f8e58e0c3..76652f77d 100644 --- a/include/hipSYCL/algorithms/merge/merge.hpp +++ b/include/hipSYCL/algorithms/merge/merge.hpp @@ -17,8 +17,10 @@ #include #include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/sycl/libkernel/nd_item.hpp" #include "hipSYCL/algorithms/util/allocation_cache.hpp" +#include "../sort/bitonic_sort.hpp" #include "merge_path.hpp" namespace hipsycl::algorithms::merging { @@ -65,15 +67,15 @@ void sequential_merge(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, template void segmented_merge( RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, RandomIt2 last2, - OutputIt out, Compare comp, std::size_t partition_index, - std::size_t partition_chunk_size) { + OutputIt out, Compare comp, std::size_t segment_index, + std::size_t segment_chunk_size) { std::size_t p1 = 0; std::size_t p2 = 0; merge_path::nth_independent_merge_begin(first1, last1, first2, last2, comp, - partition_index, - partition_chunk_size, p1, p2); + segment_index, + segment_chunk_size, p1, p2); auto chunk_first1 = first1; auto chunk_first2 = first2; @@ -84,19 +86,134 @@ void segmented_merge( auto chunk_last1 = chunk_first1; auto chunk_last2 = chunk_first2; - std::advance(chunk_last1, std::min(partition_chunk_size, + std::advance(chunk_last1, std::min(segment_chunk_size, std::distance(first1, last1) - p1)); - std::advance(chunk_last2, std::min(partition_chunk_size, + std::advance(chunk_last2, std::min(segment_chunk_size, std::distance(first2, last2) - p2)); - std::size_t chunk_out_offset = partition_index * partition_chunk_size; + std::size_t chunk_out_offset = segment_index * segment_chunk_size; auto chunk_out = out; std::advance(chunk_out, chunk_out_offset); 
sequential_merge(chunk_first1, chunk_last1, chunk_first2, chunk_last2, - chunk_out, comp, partition_chunk_size); + chunk_out, comp, segment_chunk_size); } +template +void store_segment_begin(RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, + RandomIt2 last2, Compare comp, + IndexT segment_index, + IndexT segment_size, + IndexT* first_out1, IndexT* first_out2, + std::size_t offset = 0 // Will be added to result + ) { + + auto problem_size1 = std::distance(first1, last1); + auto problem_size2 = std::distance(first2, last2); + + if(problem_size1 == 0) { + first_out1[segment_index] = offset + 0; + first_out2[segment_index] = offset + segment_index * segment_size; + } else if(problem_size2 == 0) { + first_out2[segment_index] = offset + 0; + first_out1[segment_index] = offset + segment_index * segment_size; + } else { + + IndexT p1 = 0; + IndexT p2 = 0; + + merge_path::nth_independent_merge_begin(first1, last1, first2, last2, comp, + segment_index, + segment_size, p1, p2); + + first_out1[segment_index] = p1 + offset; + first_out2[segment_index] = p2 + offset; + } +} + +template +void segment_merge_by_group_sort( + Group grp, // SYCL group object. Group size must correspond to segment size, + // grp id must correspond to segment index. 
+ RandomIt1 first1, // Iterators describing the *whole* merge range, not just + // this group + RandomIt1 last1, RandomIt2 first2, RandomIt2 last2, OutputIt out, + Compare comp, IndexT *segments_begin1, IndexT *segments_begin2, + IndexT num_segments, + typename std::iterator_traits::value_type *local_mem = nullptr) { + + int lid = grp.get_local_linear_id(); + auto grp_id = grp.get_group_linear_id(); + int grp_size = grp.get_local_linear_range(); + + std::size_t segment_begin1 = segments_begin1[grp_id]; + std::size_t segment_begin2 = segments_begin2[grp_id]; + + RandomIt1 group_first1 = first1; + std::advance(group_first1, segment_begin1); + RandomIt2 group_first2 = first2; + std::advance(group_first2, segment_begin2); + + std::size_t segment_end1 = + std::distance(group_first1, last1) + segment_begin1; + std::size_t segment_end2 = + std::distance(group_first2, last2) + segment_begin2; + if(grp_id < num_segments - 1) { + segment_end1 = segments_begin1[grp_id + 1]; + segment_end2 = segments_begin2[grp_id + 1]; + } + + RandomIt1 group_last1 = first1; + std::advance(group_last1, segment_end1); + + RandomIt2 group_last2 = first2; + std::advance(group_last2, segment_end2); + + OutputIt group_out = out; + std::advance(group_out, grp_id * grp_size); + + int input_size1 = std::distance(group_first1, group_last1); + int input_size2 = std::distance(group_first2, group_last2); + auto local_problem_size = input_size1 + input_size2; + + auto load = [](auto it, auto idx) { + std::advance(it, idx); + return *it; + }; + + auto store = [](auto it, auto idx, auto v) { + std::advance(it, idx); + *it = v; + }; + + auto barrier = [&](){ + sycl::group_barrier(grp); + }; + + if(local_mem) { + if(lid < input_size1) + local_mem[lid] = load(group_first1, lid); + if(lid < input_size2) + local_mem[lid + input_size1] = load(group_first2, lid); + + barrier(); + sorting::bitonic_group_sort(local_mem, grp_size, local_problem_size, + lid, barrier, comp); + if(lid < local_problem_size) + 
store(group_out, lid, local_mem[lid]); + } else { + if(lid < input_size1) + store(group_out, lid, load(group_first1, lid)); + if(lid < input_size2) + store(group_out, lid + input_size1, load(group_first2, lid)); + + barrier(); + sorting::bitonic_group_sort(group_out, grp_size, local_problem_size, + lid, barrier, comp); + } +} } /// Precondition: distance(fist1, last1) > 0 && distance(first2, last2) > 0. @@ -105,22 +222,88 @@ template sycl::event segmented_merge(sycl::queue &q, RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, RandomIt2 last2, OutputIt out, Compare comp, - std::size_t partition_chunk_size = 128, + std::size_t segment_chunk_size = 128, const std::vector &deps = {}) { //detail::print_merge_matrix(first1, last1, first2, last2, comp); - std::size_t problem_size = merge_path::num_partitions( - first1, last1, first2, last2, partition_chunk_size); + std::size_t problem_size = merge_path::num_independent_merges( + first1, last1, first2, last2, segment_chunk_size); if(problem_size == 0) return sycl::event{}; return q.parallel_for(sycl::range{problem_size}, deps, [=](sycl::id<1> idx) { detail::segmented_merge(first1, last1, first2, last2, out, comp, idx.get(0), - partition_chunk_size); + segment_chunk_size); }); } + +// Assumes that distance(first1,last1)!=0 && distance(first2,last2)!=0 +template +sycl::event +hierarchical_hybrid_merge(sycl::queue &q, util::allocation_group &scratch, + RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, + RandomIt2 last2, OutputIt out, Compare comp, + std::size_t segment_chunk_size = 128, + const std::vector &deps = {}) { + + //detail::print_merge_matrix(first1, last1, first2, last2, comp); + + std::size_t num_merges = merge_path::num_independent_merges( + first1, last1, first2, last2, segment_chunk_size); + std::size_t* segment_start_scratch = scratch.obtain(2 * num_merges); + + std::size_t* segment_start_scratch1 = segment_start_scratch; + std::size_t* segment_start_scratch2 = segment_start_scratch + num_merges; + + 
if(num_merges == 0) + return sycl::event{}; + + sycl::event store_segment_begin_evt = + q.parallel_for(sycl::range{num_merges}, deps, [=](sycl::id<1> idx) { + detail::store_segment_begin( + first1, last1, first2, last2, comp, idx.get(0), segment_chunk_size, + segment_start_scratch1, segment_start_scratch2); + }); + + std::size_t group_size = segment_chunk_size; + + auto deps2 = deps; + if(!q.is_in_order()) + deps2.push_back(store_segment_begin_evt); + + sycl::event group_sort_evt; + + using T = typename std::iterator_traits::value_type; + // TODO: Better to actually check local mem capacity + if(sizeof(*first1) <= 16) { + group_sort_evt = q.submit([&](sycl::handler& cgh) { + + sycl::local_accessor local_mem {group_size, cgh}; + + cgh.depends_on(deps2); + cgh.parallel_for(sycl::nd_range<1>{num_merges * group_size, group_size}, + [=](sycl::nd_item<1> idx) { + detail::segment_merge_by_group_sort( + idx.get_group(), first1, last1, first2, last2, out, + comp, segment_start_scratch1, + segment_start_scratch2, num_merges, + &(local_mem[0])); + }); + }); + } else { + group_sort_evt = q.parallel_for( + sycl::nd_range<1>{num_merges * group_size, group_size}, deps2, + [=](sycl::nd_item<1> idx) { + detail::segment_merge_by_group_sort(idx.get_group(), first1, last1, first2, + last2, out, comp, segment_start_scratch1, + segment_start_scratch2, num_merges); + }); + } + + return group_sort_evt; +} } diff --git a/include/hipSYCL/algorithms/merge/merge_path.hpp b/include/hipSYCL/algorithms/merge/merge_path.hpp index ac5ffdefe..0b503dbc2 100644 --- a/include/hipSYCL/algorithms/merge/merge_path.hpp +++ b/include/hipSYCL/algorithms/merge/merge_path.hpp @@ -44,14 +44,14 @@ class merge_path { template static constexpr Size - num_partitions(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, - ForwardIt2 last2, Size partition_chunk_size) { + num_independent_merges(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2, Size segment_chunk_size) { Size input1_size = 
static_cast(std::distance(first1, last1)); Size input2_size = static_cast(std::distance(first2, last2)); auto num_diags = total_num_diags(input1_size, input2_size); - return (num_diags + partition_chunk_size - 1) / partition_chunk_size; + return (num_diags + segment_chunk_size - 1) / segment_chunk_size; } private: diff --git a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp index 03aaf83be..ed0b838a2 100644 --- a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp +++ b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp @@ -37,6 +37,33 @@ inline bool can_compare(std::size_t left_id, std::size_t right_id, } //detail +template +void bitonic_group_sort(RandomIt first, SizeT group_size, SizeT problem_size, + SizeT item, Barrier barrier, Compare comp) { + + auto process_pass = [=](SizeT j) { + for(SizeT a_id = item; a_id < problem_size; a_id += group_size) { + SizeT b_id = a_id ^ j; + if(detail::can_compare(a_id, b_id, problem_size)) { + auto a = *detail::advance_to(first, a_id); + auto b = *detail::advance_to(first, b_id); + if(comp(b, a)) { + *detail::advance_to(first, a_id) = b; + *detail::advance_to(first, b_id) = a; + } + } + } + barrier(); + }; + + for(SizeT k = 2; (k >> 1) < problem_size; k *= 2) { + process_pass(k-1); + + for (SizeT j = k >> 1; j > 0; j >>= 1) { + process_pass(j); + } + } +} template sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, @@ -64,6 +91,7 @@ sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, most_recent_event = q.parallel_for(problem_size, k); else most_recent_event = q.parallel_for(problem_size, most_recent_event, k); + is_first_kernel = false; }; for (std::size_t k = 2; (k >> 1) < problem_size; k *= 2) { From b2f0c152da98f12d40e1f76b667575759f70e1fa Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 25 Oct 2024 03:48:17 +0200 Subject: [PATCH 029/126] [stdpar] merge: Use segmented merge when on CPU --- include/hipSYCL/algorithms/algorithm.hpp | 
11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index 1e5509000..ce9d6b884 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ b/include/hipSYCL/algorithms/algorithm.hpp @@ -486,11 +486,12 @@ sycl::event merge(sycl::queue& q, if(problem_size == 0) return sycl::event{}; - //return merging::segmented_merge(q, first1, last1, first2, last2, d_first, - // comp); - - return merging::hierarchical_hybrid_merge(q, scratch_allocations, first1, - last1, first2, last2, d_first, comp); + if (q.get_device().get_backend() == sycl::backend::omp) + return merging::segmented_merge(q, first1, last1, first2, last2, d_first, + comp); + else + return merging::hierarchical_hybrid_merge( + q, scratch_allocations, first1, last1, first2, last2, d_first, comp); } } From 652acf653386a85762ee6676aabf1b87ffe13b2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20B=C3=BCttner?= Date: Fri, 25 Oct 2024 08:44:40 +0200 Subject: [PATCH 030/126] Pass -cl-denorms-are-zero to OpenCL As mentioned in #1584, denormal numbers can sometimes cause performance degradation. For PTX and amdgcn this is already done, but has to be passed to the OpenCL build flags. 
--- src/runtime/ocl/ocl_code_object.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/ocl/ocl_code_object.cpp b/src/runtime/ocl/ocl_code_object.cpp index c0398eb43..ad352186f 100644 --- a/src/runtime/ocl/ocl_code_object.cpp +++ b/src/runtime/ocl/ocl_code_object.cpp @@ -54,7 +54,7 @@ ocl_executable_object::ocl_executable_object(const cl::Context& ctx, cl::Device& std::string options_string="-cl-uniform-work-group-size"; for(const auto& flag : config.build_flags()) { if(flag == kernel_build_flag::fast_math) { - options_string += " -cl-fast-relaxed-math"; + options_string += " -cl-fast-relaxed-math -cl-denorms-are-zero"; } } From 7f94a87f4b09395ac0150f60066f47a401484fbd Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 8 Nov 2024 00:18:12 +0100 Subject: [PATCH 031/126] Add experimental --acpp-export-all feature to make SYCL_EXTERNAL unnecessary --- bin/acpp | 15 ++- .../compiler/llvm-to-backend/Utils.hpp | 8 +- .../compiler/sscp/KernelOutliningPass.hpp | 3 + .../llvm-to-backend/LLVMToBackend.cpp | 124 +++++++++--------- src/compiler/sscp/KernelOutliningPass.cpp | 41 ++++-- src/compiler/sscp/TargetSeparationPass.cpp | 11 +- tests/compiler/sscp/export-all/export_all.cpp | 26 ++++ tests/compiler/sscp/export-all/lit.local.cfg | 1 + tests/compiler/sscp/export-all/second_tu.cpp | 5 + 9 files changed, 155 insertions(+), 79 deletions(-) create mode 100644 tests/compiler/sscp/export-all/export_all.cpp create mode 100644 tests/compiler/sscp/export-all/lit.local.cfg create mode 100644 tests/compiler/sscp/export-all/second_tu.cpp diff --git a/bin/acpp b/bin/acpp index 3891749e7..c9d24077a 100755 --- a/bin/acpp +++ b/bin/acpp @@ -363,7 +363,10 @@ class acpp_config: 'stdpar-unconditional-offload' : option("--acpp-stdpar-unconditional-offload", "ACPP_STDPAR_UNCONDITIONAL_OFFLOAD", "default-is-stdpar-unconditional-offload", """ Normally, heuristics are employed to determine whether algorithms should be offloaded. 
This particularly affects small problem sizes. If this flag is set, supported parallel STL - algorithms will be offloaded unconditionally.""") + algorithms will be offloaded unconditionally."""), + 'is-export-all' : option("--acpp-export-all", "ACPP_EXPORT_ALL", "default-export-all", +""" (Experimental) Treat all functions implicitly as SYCL_EXTERNAL. Only supported with generic target. + This currently only works with translation units that include the sycl.hpp header.""") } @@ -812,6 +815,13 @@ class acpp_config: except OptionNotSet: return False + @property + def is_export_all(self): + try: + return self._is_flag_set("is-export-all") + except OptionNotSet: + return False + @property def is_stdpar(self): try: @@ -1521,6 +1531,9 @@ class llvm_sscp_invocation: def get_cxx_flags(self): flags = ["-D__ACPP_ENABLE_LLVM_SSCP_TARGET__", "-Xclang", "-disable-O0-optnone", "-mllvm", "-acpp-sscp"] + + if self._config.is_export_all: + flags += ["-mllvm","-acpp-sscp-export-all"] sscp_compile_opts = [] if ("-Ofast" in self._config.forwarded_compiler_arguments or diff --git a/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp b/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp index 9e5edab37..883358be9 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp @@ -70,7 +70,7 @@ inline llvm::Error loadModuleFromString(const std::string &LLVMIR, llvm::LLVMCon } template -inline void constructPassBuilder(F&& handler) { +inline auto withPassBuilder(F&& handler) { llvm::LoopAnalysisManager LAM; llvm::FunctionAnalysisManager FAM; llvm::CGSCCAnalysisManager CGAM; @@ -82,11 +82,11 @@ inline void constructPassBuilder(F&& handler) { PB.registerLoopAnalyses(LAM); PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - handler(PB, LAM, FAM, CGAM, MAM); + return handler(PB, LAM, FAM, CGAM, MAM); } template -inline void constructPassBuilderAndMAM(F&& handler) { +inline auto withPassBuilderAndMAM(F&& handler) { llvm::LoopAnalysisManager 
LAM; llvm::FunctionAnalysisManager FAM; llvm::CGSCCAnalysisManager CGAM; @@ -98,7 +98,7 @@ inline void constructPassBuilderAndMAM(F&& handler) { PB.registerLoopAnalyses(LAM); PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); - handler(PB, MAM); + return handler(PB, MAM); } diff --git a/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp b/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp index f1ecca091..9f44b4e06 100644 --- a/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp +++ b/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp @@ -20,6 +20,8 @@ namespace compiler { class EntrypointPreparationPass : public llvm::PassInfoMixin { public: + EntrypointPreparationPass(bool ExportByDefault = false); + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); const std::vector& getKernelNames() const { @@ -38,6 +40,7 @@ class EntrypointPreparationPass : public llvm::PassInfoMixin KernelNames; std::vector OutliningEntrypoints; std::vector NonKernelOutliningEntrypoints; + bool ExportAll; }; // Removes all code not belonging to kernels diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index b778c2536..d8a5e59b0 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -218,33 +218,36 @@ bool LLVMToBackendTranslator::fullTransformation(const std::string &LLVMIR, std: } bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { - HIPSYCL_DEBUG_INFO << "LLVMToBackend: Preparing backend flavoring...\n"; - - if(!this->prepareBackendFlavor(M)) - return false; - - // We need to resolve symbols now instead of after optimization, because we - // may have to reuotline if the code that is linked in after symbol resolution - // depends on IR constants. 
- // This also means that we cannot error yet if we cannot resolve all symbols :( - resolveExternalSymbols(M); - HIPSYCL_DEBUG_INFO << "LLVMToBackend: Applying specializations and S2 IR constants...\n"; - for(auto& A : SpecializationApplicators) { - HIPSYCL_DEBUG_INFO << "LLVMToBackend: Processing specialization " << A.first << "\n"; - A.second(M); - } - // Return error in case applying specializations has caused error list to be populated - if(!Errors.empty()) - return false; - - bool ContainsUnsetIRConstants = false; - bool FlavoringSuccessful = false; - bool OptimizationSuccessful = false; + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Preparing backend flavoring...\n"; - constructPassBuilderAndMAM([&](llvm::PassBuilder &PB, llvm::ModuleAnalysisManager &MAM) { + return withPassBuilderAndMAM([&](llvm::PassBuilder &PB, llvm::ModuleAnalysisManager &MAM) { PassHandler PH {&PB, &MAM}; + // Do an initial outlining to simplify the code, particularly to reduce + // linking complexity if --acpp-export-all is used + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Reoutlining kernels...\n"; + KernelOutliningPass InitialOutlining{OutliningEntrypoints}; + InitialOutlining.run(M, MAM); + + // We need to resolve symbols now instead of after optimization, because we + // may have to reuotline if the code that is linked in after symbol resolution + // depends on IR constants. 
+ // This also means that we cannot error yet if we cannot resolve all symbols :( + resolveExternalSymbols(M); + + if(!this->prepareBackendFlavor(M)) + return false; + + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Applying specializations and S2 IR constants...\n"; + for(auto& A : SpecializationApplicators) { + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Processing specialization " << A.first << "\n"; + A.second(M); + } + // Return error in case applying specializations has caused error list to be populated + if(!Errors.empty()) + return false; + // Optimize away unnecessary branches due to backend-specific S2IR constants // This is what allows us to specialize code for different backends. HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing branches post S2 IR constant application...\n"; @@ -281,55 +284,56 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { ICP.run(M, MAM); HIPSYCL_DEBUG_INFO << "LLVMToBackend: Adding backend-specific flavor to IR...\n"; - FlavoringSuccessful = this->toBackendFlavor(M, PH); + if(!this->toBackendFlavor(M, PH)) { + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Flavoring failed\n"; + return false; + } + // Inline again to handle builtin definitions pulled in by backend flavors InliningPass.run(M, MAM); - if(FlavoringSuccessful) { - // Run optimizations - HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing flavored IR...\n"; + // Run optimizations + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing flavored IR...\n"; - if(IsFastMath) - setFastMathFunctionAttribs(M); + if(IsFastMath) + setFastMathFunctionAttribs(M); - // Remove argument_used hints, which are no longer needed once we enter optimization stage. - // This is primarily needed for dynamic functions. - utils::ProcessFunctionAnnotationPass PFA({"argument_used"}); - PFA.run(M, MAM); + // Remove argument_used hints, which are no longer needed once we enter optimization stage. + // This is primarily needed for dynamic functions. 
+ utils::ProcessFunctionAnnotationPass PFA({"argument_used"}); + PFA.run(M, MAM); - MAM.clear(); + MAM.clear(); - OptimizationSuccessful = optimizeFlavoredIR(M, PH); - - if(!OptimizationSuccessful) { - this->registerError("LLVMToBackend: Optimization failed"); - } + if(!optimizeFlavoredIR(M, PH)) { + this->registerError("LLVMToBackend: Optimization failed"); + return false; + } - for(const auto& Entry : FunctionsForDeadArgumentElimination) { - if(auto* F = M.getFunction(Entry.first)) { - if(isKernelAfterFlavoring(*F)) { - runKernelDeadArgumentElimination(M, F, PH, *Entry.second); - } + for(const auto& Entry : FunctionsForDeadArgumentElimination) { + if(auto* F = M.getFunction(Entry.first)) { + if(isKernelAfterFlavoring(*F)) { + runKernelDeadArgumentElimination(M, F, PH, *Entry.second); } } - llvm::AlwaysInlinerPass AIP; - AIP.run(M, MAM); - - S2IRConstant::forEachS2IRConstant(M, [&](S2IRConstant C) { - if (C.isValid()) { - if (!C.isInitialized()) { - ContainsUnsetIRConstants = true; - this->registerError("LLVMToBackend: AdaptiveCpp S2IR constant was not set: " + - C.getGlobalVariable()->getName().str()); - } - } - }); - } else { - HIPSYCL_DEBUG_INFO << "LLVMToBackend: Flavoring failed\n"; } - }); + llvm::AlwaysInlinerPass{}.run(M, MAM); + + bool ContainsUnsetIRConstants = false; + S2IRConstant::forEachS2IRConstant(M, [&](S2IRConstant C) { + if (C.isValid()) { + if (!C.isInitialized()) { + ContainsUnsetIRConstants = true; + this->registerError("LLVMToBackend: AdaptiveCpp S2IR constant was not set: " + + C.getGlobalVariable()->getName().str()); + } + } + }); + if(ContainsUnsetIRConstants) + return false; - return FlavoringSuccessful && OptimizationSuccessful && !ContainsUnsetIRConstants; + return true; + }); } bool LLVMToBackendTranslator::translatePreparedIR(llvm::Module &FlavoredModule, std::string &out) { diff --git a/src/compiler/sscp/KernelOutliningPass.cpp b/src/compiler/sscp/KernelOutliningPass.cpp index d62d348db..8cb5d328d 100644 --- 
a/src/compiler/sscp/KernelOutliningPass.cpp +++ b/src/compiler/sscp/KernelOutliningPass.cpp @@ -257,6 +257,9 @@ void canonicalizeKernelParameters(llvm::Function* F, llvm::Module& M) { } +EntrypointPreparationPass::EntrypointPreparationPass(bool ExportByDefault) +: ExportAll{ExportByDefault} {} + llvm::PreservedAnalyses EntrypointPreparationPass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) { @@ -265,6 +268,24 @@ EntrypointPreparationPass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) llvm::SmallSet Kernels; + + llvm::DenseSet MarkedFunctions; + auto MarkThisFunctionForOutlining = [&](llvm::Function* F) { + HIPSYCL_DEBUG_INFO << "Found SSCP outlining entrypoint: " << F->getName() << "\n"; + // Make kernel have external linkage to avoid having everything optimized away + F->setLinkage(llvm::GlobalValue::ExternalLinkage); + + // If we have a definition, we need to perform outlining. + // Otherwise, we would need to treat the function as imported -- + // however this cannot really happen as clang does not codegen our + // attribute((annotate("hipsycl_sscp_outlining"))) for declarations + // without definition. 
+ if(F->size() > 0 && !MarkedFunctions.contains(F)) { + this->OutliningEntrypoints.push_back(F->getName().str()); + MarkedFunctions.insert(F); + } + }; + utils::findFunctionsWithStringAnnotationsWithArg(M, [&](llvm::Function* F, llvm::StringRef Annotation, llvm::Constant* Argument){ if(F) { if(Annotation.compare(SscpKernelDimensionName) == 0){ @@ -295,22 +316,20 @@ EntrypointPreparationPass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) this->KernelNames.push_back(F->getName().str()); Kernels.insert(F->getName().str()); } + if(Annotation.compare(SSCPOutliningMarker) == 0) { - HIPSYCL_DEBUG_INFO << "Found SSCP outlining entrypoint: " << F->getName() << "\n"; - // Make kernel have external linkage to avoid having everything optimized away - F->setLinkage(llvm::GlobalValue::ExternalLinkage); - - // If we have a definition, we need to perform outlining. - // Otherwise, we would need to treat the function as imported -- - // however this cannot really happen as clang does not codegen our - // attribute((annotate("hipsycl_sscp_outlining"))) for declarations - // without definition. 
- if(F->size() > 0) - this->OutliningEntrypoints.push_back(F->getName().str()); + MarkThisFunctionForOutlining(F); } } }); + if(ExportAll) { + for(auto& F: M) { + if (!F.isIntrinsic() && F.getLinkage() != llvm::GlobalValue::LinkageTypes::InternalLinkage) + MarkThisFunctionForOutlining(&F); + } + } + for(const auto& EP : OutliningEntrypoints) { if(!Kernels.contains(EP)) { diff --git a/src/compiler/sscp/TargetSeparationPass.cpp b/src/compiler/sscp/TargetSeparationPass.cpp index 9c9a735e3..6cd9e9cb3 100644 --- a/src/compiler/sscp/TargetSeparationPass.cpp +++ b/src/compiler/sscp/TargetSeparationPass.cpp @@ -99,13 +99,18 @@ class ScopedPrintingTimer : private Timer { static llvm::cl::opt SSCPEmitHcf{ "acpp-sscp-emit-hcf", llvm::cl::init(false), - llvm::cl::desc{"Emit HCF from hipSYCL LLVM SSCP compilation flow"}}; + llvm::cl::desc{"Emit HCF from AdaptiveCpp LLVM SSCP compilation flow"}}; static llvm::cl::opt PreoptimizeSSCPKernels{ "acpp-sscp-preoptimize", llvm::cl::init(false), llvm::cl::desc{ "Preoptimize SYCL kernels in LLVM IR instead of embedding unoptimized kernels and relying " - "on optimization at runtime. This is mainly for hipSYCL developers and NOT supported!"}}; + "on optimization at runtime. 
This is mainly for AdaptiveCpp developers and NOT supported!"}}; + +static llvm::cl::opt ExportAllSymbols{ + "acpp-sscp-export-all", llvm::cl::init(false), + llvm::cl::desc{ + "(experimental) export all functions for JIT-time linking"}}; static const char *SscpIsHostIdentifier = "__acpp_sscp_is_host"; static const char *SscpIsDeviceIdentifier = "__acpp_sscp_is_device"; @@ -279,7 +284,7 @@ std::unique_ptr generateDeviceIR(llvm::Module &M, } } - EntrypointPreparationPass EPP; + EntrypointPreparationPass EPP{ExportAllSymbols}; EPP.run(*DeviceModule, DeviceMAM); ExportedSymbolsOutput = EPP.getNonKernelOutliningEntrypoints(); diff --git a/tests/compiler/sscp/export-all/export_all.cpp b/tests/compiler/sscp/export-all/export_all.cpp new file mode 100644 index 000000000..950649327 --- /dev/null +++ b/tests/compiler/sscp/export-all/export_all.cpp @@ -0,0 +1,26 @@ +// RUN: %acpp %s %S/second_tu.cpp -o %t --acpp-targets=generic --acpp-export-all +// RUN: %t | FileCheck %s +// RUN: %acpp %s %S/second_tu.cpp -o %t --acpp-targets=generic -O3 --acpp-export-all +// RUN: %t | FileCheck %s +// RUN: %acpp %s %S/second_tu.cpp -o %t --acpp-targets=generic -g --acpp-export-all +// RUN: %t | FileCheck %s + +#include +#include +#include "../common.hpp" + +// defined in second_tu.cpp +int increment(int x); + +int main() { + sycl::queue q = get_queue(); + int* data = sycl::malloc_shared(1, q); + q.single_task([=](){ + *data = increment(123); + }); + q.wait(); + + // CHECK: 124 + std::cout << *data << std::endl; + sycl::free(data, q); +} diff --git a/tests/compiler/sscp/export-all/lit.local.cfg b/tests/compiler/sscp/export-all/lit.local.cfg new file mode 100644 index 000000000..0939c330d --- /dev/null +++ b/tests/compiler/sscp/export-all/lit.local.cfg @@ -0,0 +1 @@ +config.excludes = ["second_tu.cpp"] \ No newline at end of file diff --git a/tests/compiler/sscp/export-all/second_tu.cpp b/tests/compiler/sscp/export-all/second_tu.cpp new file mode 100644 index 000000000..d6b52a9b4 --- 
/dev/null +++ b/tests/compiler/sscp/export-all/second_tu.cpp @@ -0,0 +1,5 @@ +#include + +int increment(int x) { + return x+1; +} \ No newline at end of file From e2468b5769d1492f255acb93aa9beee6050b4e75 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 8 Nov 2024 00:51:55 +0100 Subject: [PATCH 032/126] Take into account that dynamic functions can cause delayed additions to the kernel graph --- .../compiler/llvm-to-backend/LLVMToBackend.hpp | 4 ++++ src/compiler/llvm-to-backend/LLVMToBackend.cpp | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp index e530cd13e..81d8f4e58 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp @@ -232,9 +232,13 @@ class LLVMToBackendTranslator { int S2IRConstantBackendId; std::vector OutliningEntrypoints; + // function call specializations might result in additional outlining entrypoints + // that we need to consider early on + std::vector FunctionCallSpecializationOutliningEntrypoints; std::vector Kernels; std::vector Errors; + std::unordered_map> SpecializationApplicators; ExternalSymbolResolver SymbolResolver; bool HasExternalSymbolResolver = false; diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index d8a5e59b0..abebbc6f6 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -21,6 +21,7 @@ #include "hipSYCL/compiler/utils/ProcessFunctionAnnotationsPass.hpp" #include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/sycl/access.hpp" #include @@ -227,11 +228,17 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { // Do an initial outlining to simplify the code, particularly to reduce // linking complexity 
if --acpp-export-all is used HIPSYCL_DEBUG_INFO << "LLVMToBackend: Reoutlining kernels...\n"; - KernelOutliningPass InitialOutlining{OutliningEntrypoints}; + // Function call specializations are only handled at a later stage, + // so if the user has requested any, ensure that we don't throw them away + // since these functions will not yet appear in the call graph. + std::vector InitialOutliningEntrypoints = OutliningEntrypoints; + for(const auto& FName : FunctionCallSpecializationOutliningEntrypoints) + InitialOutliningEntrypoints.push_back(FName); + KernelOutliningPass InitialOutlining{InitialOutliningEntrypoints}; InitialOutlining.run(M, MAM); // We need to resolve symbols now instead of after optimization, because we - // may have to reuotline if the code that is linked in after symbol resolution + // may have to reoutline if the code that is linked in after symbol resolution // depends on IR constants. // This also means that we cannot error yet if we cannot resolve all symbols :( resolveExternalSymbols(M); @@ -488,6 +495,11 @@ void LLVMToBackendTranslator::specializeKernelArgument(const std::string &Kernel void LLVMToBackendTranslator::specializeFunctionCalls( const std::string &FuncName, const std::vector &ReplacementCalls, bool OverrideOnlyUndefined) { + + for(const auto& FName : ReplacementCalls) { + this->FunctionCallSpecializationOutliningEntrypoints.push_back(FName); + } + std::string Id = "__specialized_function_call_"+FuncName; SpecializationApplicators[Id] = [=](llvm::Module &M) { HIPSYCL_DEBUG_INFO << "LLVMToBackend: Specializing function calls to " << FuncName << " to:\n"; From d3d9b19a7537f9dd64bbb70dd18105d82ed1ba35 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 12 Nov 2024 03:18:20 +0100 Subject: [PATCH 033/126] [SSCP][Adaptivity] Specialize local memory accessor content at adaptivity level >= 1 --- include/hipSYCL/runtime/kernel_configuration.hpp | 6 ++++++ include/hipSYCL/sycl/libkernel/accessor.hpp | 5 +++-- 2 files changed, 9 
insertions(+), 2 deletions(-) diff --git a/include/hipSYCL/runtime/kernel_configuration.hpp b/include/hipSYCL/runtime/kernel_configuration.hpp index 88d796a20..530a7e375 100644 --- a/include/hipSYCL/runtime/kernel_configuration.hpp +++ b/include/hipSYCL/runtime/kernel_configuration.hpp @@ -160,6 +160,12 @@ class kernel_configuration { } void set_specialized_kernel_argument(int param_index, uint64_t buffer_value) { + for(int i = 0; i < _specialized_kernel_args.size(); ++i) { + if(_specialized_kernel_args[i].first == param_index) { + _specialized_kernel_args[i] = std::make_pair(param_index, buffer_value); + return; + } + } _specialized_kernel_args.push_back( std::make_pair(param_index, buffer_value)); } diff --git a/include/hipSYCL/sycl/libkernel/accessor.hpp b/include/hipSYCL/sycl/libkernel/accessor.hpp index 25a5b3829..b28631885 100644 --- a/include/hipSYCL/sycl/libkernel/accessor.hpp +++ b/include/hipSYCL/sycl/libkernel/accessor.hpp @@ -44,6 +44,7 @@ #include "item.hpp" #include "multi_ptr.hpp" #include "atomic.hpp" +#include "../specialized.hpp" #include "detail/local_memory_allocator.hpp" #include "detail/mobile_shared_ptr.hpp" @@ -2148,8 +2149,8 @@ class accessor< : _addr{addr}, _num_elements{r} {} - address _addr{}; - range _num_elements; + specialized
_addr; + specialized> _num_elements; }; namespace detail::accessor { From 393d2f916bfff03bd9717ed61f9bb2927f4b85e5 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 12 Nov 2024 04:00:47 +0100 Subject: [PATCH 034/126] [SSCP][libkernel][llvm-to-host] Assume that local memory is always aligned to 512 byte boundaries --- src/libkernel/sscp/host/localmem.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/libkernel/sscp/host/localmem.cpp b/src/libkernel/sscp/host/localmem.cpp index b2fdadc29..7edd7022f 100644 --- a/src/libkernel/sscp/host/localmem.cpp +++ b/src/libkernel/sscp/host/localmem.cpp @@ -13,5 +13,10 @@ extern "C" void* __acpp_cbs_sscp_dynamic_local_memory; __attribute__((address_space(3))) void* __acpp_sscp_get_dynamic_local_memory() { - return (__attribute__((address_space(3))) void*)(__acpp_cbs_sscp_dynamic_local_memory); + + // We rely on the host side allocating page-aligned memory. On all relevant + // systems, the page size is larger than 512 bytes, so using this as a + // conservative minimum alignment seems safe. 
+ return (__attribute__((address_space(3))) void *)(__builtin_assume_aligned( + __acpp_cbs_sscp_dynamic_local_memory, 512)); } From 1a720980c39586e62589f1f4e3d0e1b00d17a8cb Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 12 Nov 2024 19:47:13 +0100 Subject: [PATCH 035/126] omp_queue: Enforce minimum local mem alignment for SSCP JIT --- src/runtime/omp/omp_queue.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/runtime/omp/omp_queue.cpp b/src/runtime/omp/omp_queue.cpp index 1a65f54dd..76b4a7098 100644 --- a/src/runtime/omp/omp_queue.cpp +++ b/src/runtime/omp/omp_queue.cpp @@ -210,9 +210,13 @@ launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, // get page aligned local memory from heap static thread_local std::vector local_memory; - const auto page_size = get_page_size(); - local_memory.resize(shared_memory + page_size); - auto aligned_local_memory = reinterpret_cast(next_multiple_of(reinterpret_cast(local_memory.data()), page_size)); + // compiler/libkernel builtins assume that local mem is aligned to at least + // 512 byte boundaries + const auto local_mem_alignment = std::max(std::size_t{512}, get_page_size()); + local_memory.resize(shared_memory + local_mem_alignment); + auto aligned_local_memory = reinterpret_cast( + next_multiple_of(reinterpret_cast(local_memory.data()), + local_mem_alignment)); #ifdef _OPENMP #pragma omp for collapse(3) From a3a915539493befbe762f171ea584f8e103ee5c4 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 16 Nov 2024 04:59:06 +0100 Subject: [PATCH 036/126] [SSCP] Add SSCP JIT-time reflection infrastructure --- .../algorithms/util/memory_streaming.hpp | 3 +- .../llvm-to-backend/LLVMToBackend.hpp | 17 +-- .../ProcessS2ReflectionPass.hpp | 39 +++++++ .../hipSYCL/glue/llvm-sscp/ir_constants.hpp | 31 ------ .../glue/llvm-sscp/jit-reflection/queries.hpp | 57 ++++++++++ .../jit-reflection/reflection_map.hpp | 44 ++++++++ include/hipSYCL/glue/llvm-sscp/jit.hpp | 30 +++-- 
.../glue/llvm-sscp/s2_ir_constants.hpp | 51 --------- include/hipSYCL/runtime/cuda/cuda_queue.hpp | 2 + include/hipSYCL/runtime/hardware.hpp | 4 +- .../runtime/hip/hip_hardware_manager.hpp | 3 + include/hipSYCL/runtime/hip/hip_queue.hpp | 2 + include/hipSYCL/runtime/ocl/ocl_queue.hpp | 2 + include/hipSYCL/runtime/omp/omp_queue.hpp | 5 +- include/hipSYCL/runtime/ze/ze_queue.hpp | 2 + include/hipSYCL/sycl/jit.hpp | 1 + src/compiler/llvm-to-backend/CMakeLists.txt | 1 + .../llvm-to-backend/LLVMToBackend.cpp | 17 ++- .../ProcessS2ReflectionPass.cpp | 104 ++++++++++++++++++ .../llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp | 6 +- .../llvm-to-backend/host/LLVMToHost.cpp | 5 +- .../llvm-to-backend/ptx/LLVMToPtx.cpp | 5 +- .../llvm-to-backend/spirv/LLVMToSpirv.cpp | 5 +- src/runtime/cuda/cuda_hardware_manager.cpp | 6 + src/runtime/cuda/cuda_queue.cpp | 10 +- src/runtime/hip/hip_hardware_manager.cpp | 34 ++++++ src/runtime/hip/hip_queue.cpp | 10 +- src/runtime/ocl/ocl_hardware_manager.cpp | 7 ++ src/runtime/ocl/ocl_queue.cpp | 9 +- src/runtime/omp/omp_backend.cpp | 8 +- src/runtime/omp/omp_hardware_manager.cpp | 7 ++ src/runtime/omp/omp_queue.cpp | 12 +- src/runtime/ze/ze_hardware_manager.cpp | 7 ++ src/runtime/ze/ze_queue.cpp | 6 +- tests/compiler/sscp/s2_reflection.cpp | 68 ++++++++++++ 35 files changed, 485 insertions(+), 135 deletions(-) create mode 100644 include/hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp create mode 100644 include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp create mode 100644 include/hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp delete mode 100644 include/hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp create mode 100644 src/compiler/llvm-to-backend/ProcessS2ReflectionPass.cpp create mode 100644 tests/compiler/sscp/s2_reflection.cpp diff --git a/include/hipSYCL/algorithms/util/memory_streaming.hpp b/include/hipSYCL/algorithms/util/memory_streaming.hpp index 4d0b87512..134be33ce 100644 --- 
a/include/hipSYCL/algorithms/util/memory_streaming.hpp +++ b/include/hipSYCL/algorithms/util/memory_streaming.hpp @@ -62,7 +62,8 @@ class data_streamer { static void run(std::size_t problem_size, sycl::nd_item<1> idx, F &&f) noexcept { __acpp_if_target_sscp( - if(sycl::jit::introspect() == sycl::jit::backend::host) { + if(__acpp_sscp_jit_reflect_compiler_backend() == + sycl::jit::compiler_backend::host) { run_host(problem_size, idx, f); } else { run_device(problem_size, idx, f); diff --git a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp index 81d8f4e58..6e8c71bfc 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp @@ -23,7 +23,7 @@ #include #include #include "AddressSpaceMap.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include "hipSYCL/runtime/util.hpp" namespace llvm { @@ -50,17 +50,6 @@ class LLVMToBackendTranslator { virtual ~LLVMToBackendTranslator() {} - // Do not use inside llvm-to-backend infrastructure targets to avoid - // requiring RTTI-enabled LLVM - template - void setS2IRConstant(const T& value) { - static_assert(std::is_integral_v || std::is_floating_point_v, - "Unsupported type for S2 IR constant"); - - std::string name = typeid(__acpp_sscp_s2_ir_constant).name(); - setS2IRConstant(name, value); - } - template void setS2IRConstant(const std::string& name, T value) { setS2IRConstant(name, static_cast(&value)); @@ -82,6 +71,8 @@ class LLVMToBackendTranslator { return setBuildOption(Option, std::to_string(Value)); } + void setReflectionField(const std::string& name, uint64_t value); + // Does partial transformation to backend-flavored LLVM IR bool partialTransformation(const std::string& LLVMIR, std::string& out); @@ -248,6 +239,8 @@ class LLVMToBackendTranslator { std::vector*>> FunctionsForDeadArgumentElimination; + 
std::unordered_map ReflectionFields; + }; } diff --git a/include/hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp b/include/hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp new file mode 100644 index 000000000..bb97531f0 --- /dev/null +++ b/include/hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp @@ -0,0 +1,39 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_S2_REFLECTION_HPP +#define ACPP_S2_REFLECTION_HPP + +#include +#include +#include +#include + +namespace hipsycl { +namespace compiler { + +// Processes calls to +// - __acpp_jit_reflect_ or __acpp_s2_reflect_ functions (different synonyms), +// replacing callsites with provided constants. 
+// - __acpp_jit_reflect_knows_ or __acpp_s2_reflect_knows_<> +class ProcessS2ReflectionPass : public llvm::PassInfoMixin { +public: + ProcessS2ReflectionPass(const std::unordered_map& Fields); + llvm::PreservedAnalyses run(llvm::Module &M, + llvm::ModuleAnalysisManager &MAM); +private: + std::unordered_map SupportedFields; +}; + +} +} + +#endif + diff --git a/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp b/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp index 2ff117c05..e72e1ca3e 100644 --- a/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp +++ b/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp @@ -11,37 +11,6 @@ #ifndef HIPSYCL_IR_CONSTANTS_HPP #define HIPSYCL_IR_CONSTANTS_HPP -#include - #include "s1_ir_constants.hpp" -#include "s2_ir_constants.hpp" - -template -ValueT __acpp_sscp_s2_ir_constant::get( - ValueT default_value) noexcept { - // The static variable will cause clang to emit a global variable in LLVM IR, - // that we will turn into a constant during S2 compilation. - // - // TODO We may have to suppress compiler warnings about uninitialized data - // here - // - // S2 Compiler will look for special identifier __acpp_ir_constant_v to - // distinguish the actual IR constant from other global variables related to - // this class (e.g. type information). - static ValueT __acpp_ir_constant_v; - if (__acpp_sscp_is_device) { - return __acpp_ir_constant_v; - } else { - return default_value; - } -} - -namespace hipsycl::sycl::jit { - -template -auto introspect(ValueT default_value = {}) noexcept { - return __acpp_sscp_s2_ir_constant::get(default_value); -} -} #endif diff --git a/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp b/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp new file mode 100644 index 000000000..b2f9f4ed3 --- /dev/null +++ b/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp @@ -0,0 +1,57 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
+ * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_GLUE_JIT_REFLECTION_QUERIES_HPP +#define ACPP_GLUE_JIT_REFLECTION_QUERIES_HPP + + +namespace hipsycl{ +namespace sycl { +namespace jit { + +enum class compiler_backend : int { + spirv = 0, + ptx = 1, + amdgpu = 2, + host = 3 +}; + +namespace vendor_id { + +inline constexpr int nvidia = 4318; +inline constexpr int amd = 1022; +inline constexpr int intel = 8086; + +} + +} +} +} + + + +extern "C" bool __acpp_sscp_jit_reflect_knows_target_vendor_id(); +extern "C" bool __acpp_sscp_jit_reflect_knows_target_arch(); +extern "C" bool __acpp_sscp_jit_reflect_knows_target_has_independent_forward_progress(); +extern "C" bool __acpp_sscp_jit_reflect_knows_runtime_backend(); +extern "C" bool __acpp_sscp_jit_reflect_knows_compiler_backend(); +extern "C" bool __acpp_sscp_jit_reflect_knows_target_is_cpu(); +extern "C" bool __acpp_sscp_jit_reflect_knows_target_is_gpu(); + +extern "C" int __acpp_sscp_jit_reflect_target_vendor_id(); +extern "C" int __acpp_sscp_jit_reflect_target_arch(); +extern "C" bool __acpp_sscp_jit_reflect_target_is_cpu(); +extern "C" bool __acpp_sscp_jit_reflect_target_is_gpu(); +extern "C" bool __acpp_sscp_jit_reflect_target_has_independent_forward_progress(); +extern "C" int __acpp_sscp_jit_reflect_runtime_backend(); +extern "C" hipsycl::sycl::jit::compiler_backend __acpp_sscp_jit_reflect_compiler_backend(); + + +#endif diff --git a/include/hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp b/include/hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp new file mode 100644 index 000000000..1f471677f --- /dev/null +++ b/include/hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp @@ -0,0 +1,44 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for 
CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_GLUE_JIT_REFLECTION_MAP_HPP +#define ACPP_GLUE_JIT_REFLECTION_MAP_HPP + +#include +#include +#include + +#include "hipSYCL/runtime/hardware.hpp" + +namespace hipsycl{ +namespace glue { +namespace jit { + +using reflection_map = std::unordered_map; + +inline reflection_map construct_default_reflection_map(rt::hardware_context* ctx) { + reflection_map rmap; + rmap["target_vendor_id"] = ctx->get_property(rt::device_uint_property::vendor_id); + rmap["target_has_independent_forward_progress"] = static_cast(ctx->has( + rt::device_support_aspect::work_item_independent_forward_progress)); + rmap["target_arch"] = ctx->get_property(rt::device_uint_property::architecture); + rmap["target_is_gpu"] = ctx->is_gpu() ? 1 : 0; + rmap["target_is_cpu"] = ctx->is_cpu() ? 1 : 0; + + rmap["runtime_backend"] = ctx->get_property(rt::device_uint_property::backend_id); + // compiler_backend is set by the LLVMToBackend infrastructure. 
+ return rmap; +} + +} +} +} + +#endif diff --git a/include/hipSYCL/glue/llvm-sscp/jit.hpp b/include/hipSYCL/glue/llvm-sscp/jit.hpp index d8dc5ea8d..2c0c47486 100644 --- a/include/hipSYCL/glue/llvm-sscp/jit.hpp +++ b/include/hipSYCL/glue/llvm-sscp/jit.hpp @@ -21,6 +21,7 @@ #include "hipSYCL/runtime/kernel_cache.hpp" #include "hipSYCL/runtime/kernel_configuration.hpp" #include "hipSYCL/runtime/application.hpp" +#include "jit-reflection/reflection_map.hpp" #include #include #include @@ -230,14 +231,13 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, const std::string &source, const rt::kernel_configuration &config, const symbol_list_t& imported_symbol_names, + const reflection_map& refl_map, std::string &output) { assert(translator); runtime_linker configure_linker {translator, imported_symbol_names}; // Apply configuration - translator->setS2IRConstant( - translator->getBackendId()); for(const auto& entry : config.s2_ir_entries()) { translator->setS2IRConstant(entry.get_name(), entry.get_data_buffer()); } @@ -271,6 +271,11 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, translator->setBuildFlag(rt::to_string(flag)); } + // Set up JIT-time reflection for the code we compile + for(const auto& KV : refl_map) { + translator->setReflectionField(KV.first, KV.second); + } + // Transform code if(!translator->fullTransformation(source, output)) { // In case of failure, if a dump directory for IR is set, @@ -307,6 +312,7 @@ inline rt::result compile(compiler::LLVMToBackendTranslator* translator, const common::hcf_container* hcf, const std::string& image_name, const rt::kernel_configuration &config, + const reflection_map& refl_map, std::string &output) { assert(hcf); assert(hcf->root_node()); @@ -346,13 +352,15 @@ inline rt::result compile(compiler::LLVMToBackendTranslator* translator, symbol_list_t imported_symbol_names = target_image_node->get_as_list("imported-symbols"); - return compile(translator, source, config, 
imported_symbol_names, output); + return compile(translator, source, config, imported_symbol_names, refl_map, + output); } inline rt::result compile(compiler::LLVMToBackendTranslator* translator, rt::hcf_object_id hcf_object, const std::string& image_name, const rt::kernel_configuration &config, + const reflection_map& refl_map, std::string &output) { const common::hcf_container* hcf = rt::hcf_cache::get().get_hcf(hcf_object); if(!hcf) { @@ -362,17 +370,20 @@ inline rt::result compile(compiler::LLVMToBackendTranslator* translator, } return compile(translator, hcf, image_name, config, - output); + refl_map, output); } namespace dead_argument_elimination { // Compiles with dead-argument-elimination for the kernels, and saves // the retained argument mask in the appdb. This only works for single-kernel // compilations! -inline rt::result compile_kernel( - compiler::LLVMToBackendTranslator *translator, rt::hcf_object_id hcf_object, - const std::string &image_name, const rt::kernel_configuration &config, - rt::kernel_configuration::id_type binary_id, std::string &output) { +inline rt::result compile_kernel(compiler::LLVMToBackendTranslator *translator, + rt::hcf_object_id hcf_object, + const std::string &image_name, + const rt::kernel_configuration &config, + rt::kernel_configuration::id_type binary_id, + const reflection_map &refl_map, + std::string &output) { assert(translator->getKernels().size() == 1); @@ -386,7 +397,8 @@ inline rt::result compile_kernel( translator->enableDeadArgumentElminiation(translator->getKernels()[0], retained_args); - err = compile(translator, hcf_object, image_name, config, output); + err = compile(translator, hcf_object, image_name, config, refl_map, + output); }); return err; diff --git a/include/hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp b/include/hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp deleted file mode 100644 index c1dd14b95..000000000 --- a/include/hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * 
This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard - * parallelism for CPUs and GPUs. - * - * Copyright The AdaptiveCpp Contributors - * - * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. - * See file LICENSE in the project root for full license details. - */ -// SPDX-License-Identifier: BSD-2-Clause -#ifndef HIPSYCL_S2_IR_CONSTANTS_HPP -#define HIPSYCL_S2_IR_CONSTANTS_HPP - -/// \brief This file contains S2 IR constant definitions that may -/// be shared across the hipSYCL compiler code. -/// -/// As such, no undefined globals should be pulled into this file. -/// -/// Unlike Stage 1 IR constants, Stage 2 IR constants can be constructed -/// programmatically by the user. - -// S2 IR constants can be identified from their usage of -// __acpp_sscp_s2_ir_constant -template -struct __acpp_sscp_s2_ir_constant { - static ValueT get(ValueT default_value) noexcept; - - using value_type = ValueT; -}; - - -namespace hipsycl::glue::sscp { - struct ir_constant_name {}; -} - -namespace hipsycl::sycl::jit { - -namespace backend { - -inline constexpr int spirv = 0; -inline constexpr int ptx = 1; -inline constexpr int amdgpu = 2; -inline constexpr int host = 3; - -} - -constexpr glue::sscp::ir_constant_name current_backend; - -} - -#endif diff --git a/include/hipSYCL/runtime/cuda/cuda_queue.hpp b/include/hipSYCL/runtime/cuda/cuda_queue.hpp index 0003b6068..4c9e07a88 100644 --- a/include/hipSYCL/runtime/cuda/cuda_queue.hpp +++ b/include/hipSYCL/runtime/cuda/cuda_queue.hpp @@ -22,6 +22,7 @@ #include "hipSYCL/runtime/code_object_invoker.hpp" #include "hipSYCL/runtime/cuda/cuda_event.hpp" #include "hipSYCL/runtime/kernel_configuration.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp" // Forward declare CUstream_st instead of including cuda_runtime_api.h. 
@@ -138,6 +139,7 @@ class cuda_queue : public inorder_queue common::spin_lock _sscp_submission_spin_lock; glue::jit::cxx_argument_mapper _arg_mapper; kernel_configuration _config; + glue::jit::reflection_map _reflection_map; }; } diff --git a/include/hipSYCL/runtime/hardware.hpp b/include/hipSYCL/runtime/hardware.hpp index 4dc7d8495..654445fa2 100644 --- a/include/hipSYCL/runtime/hardware.hpp +++ b/include/hipSYCL/runtime/hardware.hpp @@ -93,7 +93,9 @@ enum class device_uint_property { printf_buffer_size, partition_max_sub_devices, - vendor_id + vendor_id, + architecture, + backend_id }; enum class device_uint_list_property { diff --git a/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp b/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp index 9b1336127..419f3dee9 100644 --- a/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp +++ b/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp @@ -62,6 +62,9 @@ class hip_hardware_context : public hardware_context std::unique_ptr _allocator; std::unique_ptr _event_pool; int _dev; + // target amdgcn architecture in numeric, hexadecimal form, e.g. + // gfx906 is represented as 0x906. 
+ int _numeric_architecture; }; class hip_hardware_manager : public backend_hardware_manager diff --git a/include/hipSYCL/runtime/hip/hip_queue.hpp b/include/hipSYCL/runtime/hip/hip_queue.hpp index a22e7563f..371e2f4f7 100644 --- a/include/hipSYCL/runtime/hip/hip_queue.hpp +++ b/include/hipSYCL/runtime/hip/hip_queue.hpp @@ -18,6 +18,7 @@ #include "hipSYCL/common/spin_lock.hpp" #include "hipSYCL/glue/llvm-sscp/jit.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp" #include "hip_instrumentation.hpp" // Avoid including HIP headers to prevent conflicts with CUDA @@ -128,6 +129,7 @@ class hip_queue : public inorder_queue common::spin_lock _sscp_submission_spin_lock; glue::jit::cxx_argument_mapper _arg_mapper; kernel_configuration _config; + glue::jit::reflection_map _reflection_map; }; } diff --git a/include/hipSYCL/runtime/ocl/ocl_queue.hpp b/include/hipSYCL/runtime/ocl/ocl_queue.hpp index ff4de7fde..0141488ec 100644 --- a/include/hipSYCL/runtime/ocl/ocl_queue.hpp +++ b/include/hipSYCL/runtime/ocl/ocl_queue.hpp @@ -19,6 +19,7 @@ #include "hipSYCL/common/spin_lock.hpp" #include "hipSYCL/glue/llvm-sscp/jit.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp" #include "hipSYCL/runtime/event.hpp" #include "hipSYCL/runtime/generic/async_worker.hpp" #include "hipSYCL/runtime/ocl/ocl_code_object.hpp" @@ -107,6 +108,7 @@ class ocl_queue : public inorder_queue common::spin_lock _sscp_submission_spin_lock; glue::jit::cxx_argument_mapper _arg_mapper; kernel_configuration _config; + glue::jit::reflection_map _reflection_map; }; } diff --git a/include/hipSYCL/runtime/omp/omp_queue.hpp b/include/hipSYCL/runtime/omp/omp_queue.hpp index 52d3320b9..b5a653ee7 100644 --- a/include/hipSYCL/runtime/omp/omp_queue.hpp +++ b/include/hipSYCL/runtime/omp/omp_queue.hpp @@ -17,11 +17,13 @@ #include "../device_id.hpp" #include "hipSYCL/common/spin_lock.hpp" #include "hipSYCL/glue/llvm-sscp/jit.hpp" +#include 
"hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp" namespace hipsycl { namespace rt { class omp_queue; +class omp_backend; class omp_sscp_code_object_invoker : public sscp_code_object_invoker { public: @@ -50,7 +52,7 @@ class omp_sscp_code_object_invoker : public sscp_code_object_invoker { class omp_queue : public inorder_queue { public: - omp_queue(backend_id id); + omp_queue(omp_backend* be, int dev); virtual ~omp_queue(); /// Inserts an event into the stream @@ -94,6 +96,7 @@ class omp_queue : public inorder_queue common::spin_lock _sscp_submission_spin_lock; glue::jit::cxx_argument_mapper _arg_mapper; kernel_configuration _config; + glue::jit::reflection_map _reflection_map; }; } diff --git a/include/hipSYCL/runtime/ze/ze_queue.hpp b/include/hipSYCL/runtime/ze/ze_queue.hpp index c37ceef73..045e4f115 100644 --- a/include/hipSYCL/runtime/ze/ze_queue.hpp +++ b/include/hipSYCL/runtime/ze/ze_queue.hpp @@ -18,6 +18,7 @@ #include "../executor.hpp" #include "../inorder_queue.hpp" #include "hipSYCL/glue/llvm-sscp/jit.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp" #include "hipSYCL/runtime/code_object_invoker.hpp" #include "hipSYCL/runtime/event.hpp" #include "hipSYCL/runtime/hints.hpp" @@ -107,6 +108,7 @@ class ze_queue : public inorder_queue // SSCP submission data glue::jit::cxx_argument_mapper _arg_mapper; kernel_configuration _config; + glue::jit::reflection_map _reflection_map; }; } diff --git a/include/hipSYCL/sycl/jit.hpp b/include/hipSYCL/sycl/jit.hpp index c81463ee4..912c7c08d 100644 --- a/include/hipSYCL/sycl/jit.hpp +++ b/include/hipSYCL/sycl/jit.hpp @@ -20,6 +20,7 @@ #if ACPP_LIBKERNEL_IS_DEVICE_PASS_SSCP #include "hipSYCL/glue/reflection.hpp" #include "hipSYCL/glue/llvm-sscp/fcall_specialization.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include "hipSYCL/common/stable_running_hash.hpp" #include "hipSYCL/common/unordered_dense.hpp" #include "exception.hpp" diff --git 
a/src/compiler/llvm-to-backend/CMakeLists.txt b/src/compiler/llvm-to-backend/CMakeLists.txt index e5674efcd..ea62f010d 100644 --- a/src/compiler/llvm-to-backend/CMakeLists.txt +++ b/src/compiler/llvm-to-backend/CMakeLists.txt @@ -120,6 +120,7 @@ if(WITH_SSCP_COMPILER) GlobalSizesFitInI32OptPass.cpp GlobalInliningAttributorPass.cpp DeadArgumentEliminationPass.cpp + ProcessS2ReflectionPass.cpp ../sscp/KernelOutliningPass.cpp) if(WITH_LLVM_TO_SPIRV) diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index abebbc6f6..8f2397db1 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -15,12 +15,13 @@ #include "hipSYCL/compiler/llvm-to-backend/GlobalInliningAttributorPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/KnownGroupSizeOptPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp" +#include "hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" #include "hipSYCL/compiler/sscp/KernelOutliningPass.hpp" #include "hipSYCL/compiler/utils/ProcessFunctionAnnotationsPass.hpp" #include "hipSYCL/compiler/utils/LLVMUtils.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include "hipSYCL/sycl/access.hpp" #include @@ -255,6 +256,15 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { if(!Errors.empty()) return false; + // Process stage 2 reflection calls + ReflectionFields["compiler_backend"] = this->getBackendId(); + for(const auto& Fields : ReflectionFields) { + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Setting up reflection fields: " << Fields.first << " = " + << Fields.second << "\n"; + } + ProcessS2ReflectionPass S2RP{ReflectionFields}; + S2RP.run(M, MAM); + // Optimize away unnecessary branches due to backend-specific S2IR constants // This is 
what allows us to specialize code for different backends. HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing branches post S2 IR constant application...\n"; @@ -682,6 +692,11 @@ void LLVMToBackendTranslator::runKernelDeadArgumentElimination( << "\n"; } } + +void LLVMToBackendTranslator::setReflectionField(const std::string &str, uint64_t value) { + ReflectionFields[str] = value; +} + } } diff --git a/src/compiler/llvm-to-backend/ProcessS2ReflectionPass.cpp b/src/compiler/llvm-to-backend/ProcessS2ReflectionPass.cpp new file mode 100644 index 000000000..6605eee35 --- /dev/null +++ b/src/compiler/llvm-to-backend/ProcessS2ReflectionPass.cpp @@ -0,0 +1,104 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" +#include +#include +#include +#include + +#include +#include + +namespace hipsycl { +namespace compiler { + +namespace { + +void handleReflectionFunction(llvm::Module& M, llvm::Function& F, uint64_t Value) { + F.setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage); + auto *ReplacementValue = llvm::ConstantInt::get( + M.getContext(), llvm::APInt{F.getReturnType()->getIntegerBitWidth(), Value}); + + llvm::SmallVector CallsToRemove; + for(auto* U : F.users()) { + if(auto* CB = llvm::dyn_cast(U)){ + CB->replaceNonMetadataUsesWith(ReplacementValue); + CallsToRemove.push_back(CB); + } + } + for (auto *C : CallsToRemove) { + C->replaceAllUsesWith(llvm::UndefValue::get(C->getType())); + C->dropAllReferences(); + C->eraseFromParent(); + } +} + +std::string getQueryName(llvm::StringRef FunctionName, const std::string& Prefix) { + auto Pos = 
FunctionName.find(Prefix); + if(Pos == std::string::npos) + return {}; + + return FunctionName.str().substr(Pos+Prefix.length()); +} + +} + +ProcessS2ReflectionPass::ProcessS2ReflectionPass( + const std::unordered_map &Fields) { + + for(const auto& KV : Fields) { + std::string CanonicalizedKey = KV.first; + + std::transform(CanonicalizedKey.begin(), CanonicalizedKey.end(), CanonicalizedKey.begin(), + [](unsigned char c) { return std::tolower(c); }); + + for(auto& c : CanonicalizedKey) + if(!std::isalnum(c) && c != '_') + c='_'; + + SupportedFields[CanonicalizedKey] = KV.second; + } +} + +llvm::PreservedAnalyses ProcessS2ReflectionPass::run(llvm::Module& M, llvm::ModuleAnalysisManager& MAM) { + + + auto processReflectionCalls = [&](const std::string &QueryPrefix, + const std::string &KnowsQueryPrefix) { + for(auto& F : M) { + // Note: The order of the if/else branch here assumes that + // QueryPrefix is a substring of KnowsQueryPrefix! + if(llvmutils::starts_with(F.getName(), KnowsQueryPrefix)) { + auto QueryName = getQueryName(F.getName(), KnowsQueryPrefix); + auto It = SupportedFields.find(QueryName); + if(It != SupportedFields.end()) + handleReflectionFunction(M, F, 1); + else + handleReflectionFunction(M, F, 0); + } else if(llvmutils::starts_with(F.getName(), QueryPrefix)) { + auto QueryName = getQueryName(F.getName(), QueryPrefix); + auto It = SupportedFields.find(QueryName); + if(It != SupportedFields.end()) + handleReflectionFunction(M, F, It->second); + } + } + }; + + processReflectionCalls("__acpp_sscp_jit_reflect_", "__acpp_sscp_jit_reflect_knows_"); + processReflectionCalls("__acpp_sscp_s2_reflect_", "__acpp_sscp_s2_reflect_knows_"); + + return llvm::PreservedAnalyses::none(); +} + +} +} \ No newline at end of file diff --git a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp index c116e1881..0ebc5eb73 100644 --- a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp +++ 
b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp @@ -13,7 +13,7 @@ #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" #include "hipSYCL/compiler/utils/LLVMUtils.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include "hipSYCL/common/filesystem.hpp" #include "hipSYCL/common/debug.hpp" #include @@ -209,12 +209,12 @@ class RocmDeviceLibs { }; LLVMToAmdgpuTranslator::LLVMToAmdgpuTranslator(const std::vector &KN) - : LLVMToBackendTranslator{sycl::jit::backend::amdgpu, KN, KN}, KernelNames{KN} { + : LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::amdgpu), KN, KN}, + KernelNames{KN} { RocmDeviceLibsPath = common::filesystem::join_path(RocmPath, std::vector{"amdgcn", "bitcode"}); } - bool LLVMToAmdgpuTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { M.setTargetTriple(TargetTriple); diff --git a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp index 8b776d9dc..10b7eb654 100644 --- a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp +++ b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp @@ -22,7 +22,7 @@ #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include #include @@ -59,7 +59,8 @@ namespace hipsycl { namespace compiler { LLVMToHostTranslator::LLVMToHostTranslator(const std::vector &KN) - : LLVMToBackendTranslator{sycl::jit::backend::host, KN, KN}, KernelNames{KN} {} + : LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::host), KN, KN}, + KernelNames{KN} {} bool LLVMToHostTranslator::toBackendFlavor(llvm::Module &M, PassHandler &PH) { diff --git 
a/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp b/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp index aa0ba57a3..ff1538ab1 100644 --- a/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp +++ b/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp @@ -13,7 +13,7 @@ #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceInferencePass.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include "hipSYCL/common/filesystem.hpp" #include "hipSYCL/common/debug.hpp" #include @@ -155,7 +155,8 @@ void replaceBrokenLLVMIntrinsics(llvm::Module& M) { } LLVMToPtxTranslator::LLVMToPtxTranslator(const std::vector &KN) - : LLVMToBackendTranslator{sycl::jit::backend::ptx, KN, KN}, KernelNames{KN} {} + : LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::ptx), KN, KN}, + KernelNames{KN} {} bool LLVMToPtxTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { std::string Triple = "nvptx64-nvidia-cuda"; diff --git a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp index d33522d2e..2a64a3440 100644 --- a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp +++ b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp @@ -15,7 +15,7 @@ #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" #include "hipSYCL/compiler/utils/LLVMUtils.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include "hipSYCL/common/filesystem.hpp" #include "hipSYCL/common/debug.hpp" #include @@ -123,7 +123,8 @@ void assignSPIRCallConvention(llvm::Function *F) { } LLVMToSpirvTranslator::LLVMToSpirvTranslator(const std::vector &KN) - : LLVMToBackendTranslator{sycl::jit::backend::spirv, KN, KN}, KernelNames{KN} {} + : 
LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::spirv), KN, KN}, + KernelNames{KN} {} bool LLVMToSpirvTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { diff --git a/src/runtime/cuda/cuda_hardware_manager.cpp b/src/runtime/cuda/cuda_hardware_manager.cpp index dd00c562e..e4e9819fa 100644 --- a/src/runtime/cuda/cuda_hardware_manager.cpp +++ b/src/runtime/cuda/cuda_hardware_manager.cpp @@ -359,6 +359,12 @@ cuda_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return 4318; break; + case device_uint_property::architecture: + return _properties->major * 10 + _properties->minor; + break; + case device_uint_property::backend_id: + return static_cast(backend_id::cuda); + break; } assert(false && "Invalid device property"); std::terminate(); diff --git a/src/runtime/cuda/cuda_queue.cpp b/src/runtime/cuda/cuda_queue.cpp index 07fa5ba10..6e7726edb 100644 --- a/src/runtime/cuda/cuda_queue.cpp +++ b/src/runtime/cuda/cuda_queue.cpp @@ -176,6 +176,9 @@ cuda_queue::cuda_queue(cuda_backend *be, device_id dev, int priority) _kernel_cache{kernel_cache::get()} { this->activate_device(); + _reflection_map = glue::jit::construct_default_reflection_map( + be->get_hardware_manager()->get_device(dev.get_id())); + cudaError_t err; if(priority == 0) { err = cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking); @@ -644,10 +647,11 @@ result cuda_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { - err = glue::jit::compile(translator.get(), - hcf_object, selected_image_name, _config, compiled_image); + err = + glue::jit::compile(translator.get(), hcf_object, selected_image_name, + _config, _reflection_map, compiled_image); } if(!err.is_success()) { diff 
--git a/src/runtime/hip/hip_hardware_manager.cpp b/src/runtime/hip/hip_hardware_manager.cpp index 5455703e1..a1839229b 100644 --- a/src/runtime/hip/hip_hardware_manager.cpp +++ b/src/runtime/hip/hip_hardware_manager.cpp @@ -17,10 +17,37 @@ #include #include #include +#include namespace hipsycl { namespace rt { +namespace { + + +int device_arch_string_to_int(const std::string& device_name) { + std::string prefix = "gfx"; + + if(device_name.find(prefix) != 0) + return 0; + + std::string substr = device_name; + substr.erase(0, prefix.length()); + + auto colon_pos = substr.find(":"); + if(colon_pos != std::string::npos) { + substr.erase(colon_pos); + } + + for(int i = 0; i < substr.length(); ++i) { + if(!std::isxdigit(substr[i])) + return 0; + } + return std::stoi(substr, nullptr, 16); +} + +} + hip_hardware_manager::hip_hardware_manager(hardware_platform hw_platform) : _hw_platform(hw_platform) { @@ -97,6 +124,8 @@ hip_hardware_context::hip_hardware_context(int dev) : _dev{dev} { _allocator = std::make_unique( backend_descriptor{hardware_platform::rocm, api_platform::hip}, _dev); _event_pool = std::make_unique(_dev); + + _numeric_architecture = device_arch_string_to_int(get_device_arch()); } hip_allocator* hip_hardware_context::get_allocator() const { @@ -366,6 +395,11 @@ hip_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return 1022; break; + case device_uint_property::architecture: + return _numeric_architecture; + case device_uint_property::backend_id: + return static_cast(backend_id::hip); + break; } assert(false && "Invalid device property"); std::terminate(); diff --git a/src/runtime/hip/hip_queue.cpp b/src/runtime/hip/hip_queue.cpp index dd513ddc4..8d0e8f16a 100644 --- a/src/runtime/hip/hip_queue.cpp +++ b/src/runtime/hip/hip_queue.cpp @@ -167,6 +167,9 @@ hip_queue::hip_queue(hip_backend *be, device_id dev, int priority) _kernel_cache{kernel_cache::get()} { this->activate_device(); + _reflection_map = 
glue::jit::construct_default_reflection_map( + be->get_hardware_manager()->get_device(dev.get_id())); + hipError_t err; if(priority == 0) { err = hipStreamCreateWithFlags(&_stream, hipStreamNonBlocking); @@ -642,10 +645,11 @@ result hip_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { - err = glue::jit::compile(translator.get(), - hcf_object, selected_image_name, _config, compiled_image); + err = + glue::jit::compile(translator.get(), hcf_object, selected_image_name, + _config, _reflection_map, compiled_image); } if(!err.is_success()) { diff --git a/src/runtime/ocl/ocl_hardware_manager.cpp b/src/runtime/ocl/ocl_hardware_manager.cpp index b6c147154..86f42c9f5 100644 --- a/src/runtime/ocl/ocl_hardware_manager.cpp +++ b/src/runtime/ocl/ocl_hardware_manager.cpp @@ -469,6 +469,13 @@ std::size_t ocl_hardware_context::get_property(device_uint_property prop) const return static_cast( info_query(_dev)); break; + case device_uint_property::architecture: + // TODO + return 0; + break; + case device_uint_property::backend_id: + return static_cast(backend_id::ocl); + break; } assert(false && "Invalid device property"); std::terminate(); diff --git a/src/runtime/ocl/ocl_queue.cpp b/src/runtime/ocl/ocl_queue.cpp index 25690afb5..e3dea185a 100644 --- a/src/runtime/ocl/ocl_queue.cpp +++ b/src/runtime/ocl/ocl_queue.cpp @@ -125,6 +125,8 @@ ocl_queue::ocl_queue(ocl_hardware_manager* hw_manager, std::size_t device_index) error_info{"ocl_queue: Couldn't construct backend queue", error_code{"CL", err}}); } + + _reflection_map = glue::jit::construct_default_reflection_map(dev_ctx); } ocl_queue::~ocl_queue() {} @@ -517,10 +519,11 @@ result ocl_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = 
glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { - err = glue::jit::compile(translator.get(), - hcf_object, selected_image_name, _config, compiled_image); + err = + glue::jit::compile(translator.get(), hcf_object, selected_image_name, + _config, _reflection_map, compiled_image); } if(!err.is_success()) { diff --git a/src/runtime/omp/omp_backend.cpp b/src/runtime/omp/omp_backend.cpp index 615763aec..c88b85fef 100644 --- a/src/runtime/omp/omp_backend.cpp +++ b/src/runtime/omp/omp_backend.cpp @@ -38,14 +38,14 @@ namespace rt { namespace { -std::unique_ptr make_omp_queue(device_id dev) { - return std::make_unique(dev.get_backend()); +std::unique_ptr make_omp_queue(omp_backend* be, device_id dev) { + return std::make_unique(be, dev.get_id()); } std::unique_ptr create_multi_queue_executor(omp_backend *b) { - return std::make_unique(*b, [](device_id dev) { - return make_omp_queue(dev); + return std::make_unique(*b, [b](device_id dev) { + return make_omp_queue(b, dev); }); } diff --git a/src/runtime/omp/omp_hardware_manager.cpp b/src/runtime/omp/omp_hardware_manager.cpp index a1d056c73..d36aad6c7 100644 --- a/src/runtime/omp/omp_hardware_manager.cpp +++ b/src/runtime/omp/omp_hardware_manager.cpp @@ -265,6 +265,13 @@ omp_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return std::numeric_limits::max(); break; + case device_uint_property::architecture: + // TODO + return 0; + break; + case device_uint_property::backend_id: + return static_cast(backend_id::omp); + break; } assert(false && "Invalid device property"); return 0; diff --git a/src/runtime/omp/omp_queue.cpp b/src/runtime/omp/omp_queue.cpp index 76b4a7098..0b2a94687 100644 --- a/src/runtime/omp/omp_queue.cpp +++ b/src/runtime/omp/omp_queue.cpp @@ -21,6 +21,7 @@ #include 
"hipSYCL/runtime/instrumentation.hpp" #include "hipSYCL/runtime/kernel_launcher.hpp" #include "hipSYCL/runtime/omp/omp_event.hpp" +#include "hipSYCL/runtime/omp/omp_backend.hpp" #include "hipSYCL/runtime/operations.hpp" #include "hipSYCL/runtime/queue_completion_event.hpp" #include "hipSYCL/runtime/signal_channel.hpp" @@ -236,9 +237,12 @@ launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, #endif } // namespace -omp_queue::omp_queue(backend_id id) - : _backend_id(id), _sscp_code_object_invoker{this}, - _kernel_cache{kernel_cache::get()} {} +omp_queue::omp_queue(omp_backend* be, int dev) + : _backend_id{be->get_unique_backend_id()}, _sscp_code_object_invoker{this}, + _kernel_cache{kernel_cache::get()} { + _reflection_map = glue::jit::construct_default_reflection_map( + be->get_hardware_manager()->get_device(dev)); +} omp_queue::~omp_queue() { _worker.halt(); } @@ -443,7 +447,7 @@ result omp_queue::submit_sscp_kernel_from_code_object( // Lower kernels to binary auto err = glue::jit::compile(translator.get(), hcf, selected_image_name, - _config, compiled_image); + _config, _reflection_map, compiled_image); if (!err.is_success()) { register_error(err); diff --git a/src/runtime/ze/ze_hardware_manager.cpp b/src/runtime/ze/ze_hardware_manager.cpp index 49511542c..9fe3aaada 100644 --- a/src/runtime/ze/ze_hardware_manager.cpp +++ b/src/runtime/ze/ze_hardware_manager.cpp @@ -449,6 +449,13 @@ std::size_t ze_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return _props.vendorId; break; + case device_uint_property::architecture: + // TODO + return 0; + break; + case device_uint_property::backend_id: + return static_cast(backend_id::level_zero); + break; } assert(false && "Invalid device property"); std::terminate(); diff --git a/src/runtime/ze/ze_queue.cpp b/src/runtime/ze/ze_queue.cpp index 2ce9a774b..db281ad0f 100644 --- a/src/runtime/ze/ze_queue.cpp +++ b/src/runtime/ze/ze_queue.cpp @@ -149,6 
+149,8 @@ ze_queue::ze_queue(ze_hardware_manager *hw_manager, std::size_t device_index) ze_hardware_context *hw_context = cast(hw_manager->get_device(device_index)); + + _reflection_map = glue::jit::construct_default_reflection_map(hw_context); assert(hw_context); @@ -504,10 +506,10 @@ result ze_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { err = glue::jit::compile(translator.get(), - hcf_object, selected_image_name, _config, compiled_image); + hcf_object, selected_image_name, _config, _reflection_map, compiled_image); } if(!err.is_success()) { diff --git a/tests/compiler/sscp/s2_reflection.cpp b/tests/compiler/sscp/s2_reflection.cpp new file mode 100644 index 000000000..dd4429017 --- /dev/null +++ b/tests/compiler/sscp/s2_reflection.cpp @@ -0,0 +1,68 @@ + +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O3 +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O3 -ffast-math +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -g +// RUN: %t | FileCheck %s + +#include +#include +#include +#include "common.hpp" +#include "hipSYCL/runtime/hardware.hpp" + +extern "C" bool __acpp_sscp_jit_reflect_knows_random_unknown_thing(); + +int main() { + sycl::queue q = get_queue(); + int* data = sycl::malloc_shared(6, q); + + q.single_task([data]{ + __acpp_if_target_device( + data[0] = __acpp_sscp_jit_reflect_runtime_backend(); + data[1] = __acpp_sscp_jit_reflect_target_arch(); + data[2] = __acpp_sscp_jit_reflect_target_is_cpu(); + data[3] = static_cast(__acpp_sscp_jit_reflect_compiler_backend()); + data[4] = __acpp_sscp_jit_reflect_target_vendor_id(); + + data[5] = 
__acpp_sscp_jit_reflect_knows_runtime_backend(); + data[6] = __acpp_sscp_jit_reflect_knows_random_unknown_thing(); + ); + }).wait(); + + auto dev = q.get_device().AdaptiveCpp_device_id(); + hipsycl::rt::runtime_keep_alive_token rt; + hipsycl::rt::hardware_context *ctx = rt.get() + ->backends() + .get(dev.get_backend()) + ->get_hardware_manager() + ->get_device(dev.get_id()); + + // CHECK: 1 + std::cout << (data[0] == static_cast(dev.get_backend())) << std::endl; + // CHECK: 1 + std::cout << (data[1] == + static_cast(ctx->get_property( + hipsycl::rt::device_uint_property::architecture))) + << std::endl; + // CHECK: 1 + std::cout << (data[2] == static_cast(ctx->is_cpu())) << std::endl; + + // We don't have a mechanism yet to query compiler backends on the host, so + // cannot test data[3] for now. + + // CHECK: 1 + std::cout << (data[4] == + static_cast(ctx->get_property( + hipsycl::rt::device_uint_property::vendor_id))) << std::endl; + + // CHECK: 1 + std::cout << data[5] << std::endl; + // CHECK: 0 + std::cout << data[6] << std::endl; + + sycl::free(data, q); +} From 484e3d0c402e2ba119503158dd6fa72157fcea49 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 16 Nov 2024 18:25:33 +0100 Subject: [PATCH 037/126] Fix out-of-bound memory access in test case --- tests/compiler/sscp/s2_reflection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/compiler/sscp/s2_reflection.cpp b/tests/compiler/sscp/s2_reflection.cpp index dd4429017..c9b8fe918 100644 --- a/tests/compiler/sscp/s2_reflection.cpp +++ b/tests/compiler/sscp/s2_reflection.cpp @@ -18,7 +18,7 @@ extern "C" bool __acpp_sscp_jit_reflect_knows_random_unknown_thing(); int main() { sycl::queue q = get_queue(); - int* data = sycl::malloc_shared(6, q); + int* data = sycl::malloc_shared(7, q); q.single_task([data]{ __acpp_if_target_device( From f09b4ab97aca651446246754976573561d3aeec5 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 16 Nov 2024 18:35:50 +0100 Subject: [PATCH 038/126] 
Remove old S2 IR constant infrastructure --- .../llvm-to-backend/LLVMToBackend.hpp | 6 -- include/hipSYCL/glue/llvm-sscp/jit.hpp | 3 - .../hipSYCL/runtime/kernel_configuration.hpp | 80 ------------------- .../llvm-to-backend/LLVMToBackend.cpp | 7 -- 4 files changed, 96 deletions(-) diff --git a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp index 6e8c71bfc..7499e14f8 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp @@ -50,12 +50,6 @@ class LLVMToBackendTranslator { virtual ~LLVMToBackendTranslator() {} - template - void setS2IRConstant(const std::string& name, T value) { - setS2IRConstant(name, static_cast(&value)); - } - - void setS2IRConstant(const std::string& name, const void* ValueBuffer); void specializeKernelArgument(const std::string &KernelName, int ParamIndex, const void *ValueBuffer); void specializeFunctionCalls(const std::string &FuncName, diff --git a/include/hipSYCL/glue/llvm-sscp/jit.hpp b/include/hipSYCL/glue/llvm-sscp/jit.hpp index 2c0c47486..3f552b95b 100644 --- a/include/hipSYCL/glue/llvm-sscp/jit.hpp +++ b/include/hipSYCL/glue/llvm-sscp/jit.hpp @@ -238,9 +238,6 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, runtime_linker configure_linker {translator, imported_symbol_names}; // Apply configuration - for(const auto& entry : config.s2_ir_entries()) { - translator->setS2IRConstant(entry.get_name(), entry.get_data_buffer()); - } if(translator->getKernels().size() == 1) { // Currently we only can specialize kernel arguments for the // single-kernel code object model diff --git a/include/hipSYCL/runtime/kernel_configuration.hpp b/include/hipSYCL/runtime/kernel_configuration.hpp index 530a7e375..f4a717914 100644 --- a/include/hipSYCL/runtime/kernel_configuration.hpp +++ b/include/hipSYCL/runtime/kernel_configuration.hpp @@ -83,62 +83,6 @@ to_build_flag(const 
std::string& s); class kernel_configuration { - - class s2_ir_configuration_entry { - static constexpr std::size_t buffer_size = 8; - - std::string _name; - std::type_index _type; - std::array _value; - std::size_t _data_size; - - - template - void store(const T& val) { - static_assert(sizeof(T) <= buffer_size, - "Unsupported kernel configuration value type"); - for(int i = 0; i < _value.size(); ++i) - _value[i] = 0; - - memcpy(_value.data(), &val, sizeof(val)); - } - - public: - template - s2_ir_configuration_entry(const std::string& name, const T& val) - : _name{name}, _type{typeid(T)}, _data_size{sizeof(T)} { - store(val); - } - - template - T get_value() const { - static_assert(sizeof(T) <= buffer_size, - "Unsupported kernel configuration value type"); - T v; - memcpy(&v, _value.data(), sizeof(T)); - return v; - } - - template - bool is_type() const { - return _type == typeid(T); - } - - const void* get_data_buffer() const { - return _value.data(); - } - - std::size_t get_data_size() const { - return _data_size; - } - - const std::string& get_name() const { - return _name; - } - }; - - - public: struct int_or_string{ std::optional int_value; @@ -147,18 +91,6 @@ class kernel_configuration { using id_type = std::array; - template - void set_s2_ir_constant(const std::string& config_parameter_name, const T& value) { - s2_ir_configuration_entry entry{config_parameter_name, value}; - for(int i = 0; i < _s2_ir_configurations.size(); ++i) { - if(_s2_ir_configurations[i].get_name() == config_parameter_name) { - _s2_ir_configurations[i] = entry; - return; - } - } - _s2_ir_configurations.push_back(entry); - } - void set_specialized_kernel_argument(int param_index, uint64_t buffer_value) { for(int i = 0; i < _specialized_kernel_args.size(); ++i) { if(_specialized_kernel_args[i].first == param_index) { @@ -217,12 +149,6 @@ class kernel_configuration { id_type generate_id() const { id_type result = _base_configuration_result; - for(const auto& entry : _s2_ir_configurations) 
{ - add_entry_to_hash(result, entry.get_name().data(), - entry.get_name().size(), entry.get_data_buffer(), - entry.get_data_size()); - } - for(const auto& entry : _build_options) { uint64_t numeric_option_id = static_cast(entry.first) | (1ull << 32); if(entry.second.int_value.has_value()) { @@ -258,10 +184,6 @@ class kernel_configuration { return result; } - const auto& s2_ir_entries() const { - return _s2_ir_configurations; - } - const auto& build_options() const { return _build_options; } @@ -333,8 +255,6 @@ class kernel_configuration { hash[entry_hash % hash.size()] ^= entry_hash; } - - std::vector _s2_ir_configurations; std::vector _build_flags; std::vector> _build_options; std::vector> _specialized_kernel_args; diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index 8f2397db1..77758fa2e 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -432,13 +432,6 @@ bool LLVMToBackendTranslator::linkBitcodeFile(llvm::Module &M, const std::string LinkOnlyNeeded); } -void LLVMToBackendTranslator::setS2IRConstant(const std::string &name, const void *ValueBuffer) { - SpecializationApplicators[name] = [=](llvm::Module& M){ - S2IRConstant C = S2IRConstant::getFromConstantName(M, name); - C.set(ValueBuffer); - }; -} - void LLVMToBackendTranslator::specializeKernelArgument(const std::string &KernelName, int ParamIndex, const void *ValueBuffer) { std::string Id = KernelName+"__specialized_kernel_argument_"+std::to_string(ParamIndex); From 2947db254c1c1100dd6736f0298a2f6a59e17293 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 19 Nov 2024 06:03:09 +0100 Subject: [PATCH 039/126] [SSCP][llvm-to-amdgpu] Adapt amdgcn data layout to LLVM 18 changes --- src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp 
b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp index c116e1881..25cdcb7d6 100644 --- a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp +++ b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp @@ -218,7 +218,11 @@ LLVMToAmdgpuTranslator::LLVMToAmdgpuTranslator(const std::vector &K bool LLVMToAmdgpuTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { M.setTargetTriple(TargetTriple); -#if LLVM_VERSION_MAJOR >= 17 +#if LLVM_VERSION_MAJOR >= 18 + M.setDataLayout("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:" + "32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:" + "256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"); +#elif LLVM_VERSION_MAJOR >= 17 M.setDataLayout( "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-" "i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-" From c07b5023c8c9b7937a29b3b2b2ac754297ed4f18 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 19 Nov 2024 05:44:09 +0100 Subject: [PATCH 040/126] Add high-level C++ API for JIT-time code specialization --- doc/extensions.md | 147 +++++++++++++++++- .../algorithms/util/memory_streaming.hpp | 16 +- .../glue/llvm-sscp/jit-reflection/queries.hpp | 65 +++++++- include/hipSYCL/sycl/extensions.hpp | 1 + include/hipSYCL/sycl/jit.hpp | 6 +- .../llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp | 2 +- .../llvm-to-backend/host/LLVMToHost.cpp | 2 +- .../llvm-to-backend/ptx/LLVMToPtx.cpp | 2 +- .../llvm-to-backend/spirv/LLVMToSpirv.cpp | 2 +- tests/compiler/sscp/dynamic_function.cpp | 8 +- tests/compiler/sscp/s2_reflection.cpp | 20 +++ 11 files changed, 248 insertions(+), 23 deletions(-) diff --git a/doc/extensions.md b/doc/extensions.md index 455899b4a..3b8741f3f 100644 --- a/doc/extensions.md +++ b/doc/extensions.md @@ -4,6 +4,145 @@ AdaptiveCpp implements several extensions that are not defined by the specificat ## Supported 
extensions +### `ACPP_EXT_JIT_COMPILE_IF` + +Allows for specializing code based on target properties only known at JIT time. This is only supported with AdaptiveCpp's default generic JIT compiler (`--acpp-targets=generic`). +If you also want to support other compilation flows, use of the following APIs must +be guarded using `__acpp_if_target_sscp()`. + +#### Example +```c++ +namespace jit = sycl::AdaptiveCpp_jit; + +__acpp_if_target_sscp( + jit::compile_if( + jit::reflect() == + jit::vendor_id::nvidia, + [](){ + // Will only be included in the JIT-compiled kernel if the target is NVIDIA hardware. + // The branching will be evaluated at JIT-time; there will be no runtime overhead + // in the generated kernel. + // + // As such, this mechanism can also be used to guard code that is unsupported or + // does not compile correctly on other hardware. + }); +); + +``` + +#### API reference + +```c++ +namespace sycl::AdaptiveCpp_jit { + +/// JIT reflection API + +enum class compiler_backend : int { + spirv, + ptx, + amdgpu, + host +}; + +namespace vendor_id { + +// These vendor ids are provided for convenience since +// they frequently occur; this list is non-exclusive; other +// vendor_id values might be returned by JIT reflection APIs. +inline constexpr int nvidia; +inline constexpr int amd; +inline constexpr int intel; +} + +/// +/// This namespace defines properties that the JIT compiler can be queried for. +namespace reflection_query { + +/// Vendor id of the target hardware +/// Return type: int +struct target_vendor_id; + +/// Returns a numeric identifier for the target architecture. For NVIDIA GPUs, this +/// is the SM architecture (e.g. 86 for sm_86). For AMD GPUs, it is the amdgcn architecture +/// as an hexadecimal number (e.g. 0x90c for gfx90c). +/// For other hardware, the query currently returns 0. +/// Return type: int +struct target_arch; + +/// Returns whether the hardware has independent forward progress for each work item. 
+/// Return type: bool +struct target_has_independent_forward_progress; + +/// Returns whether the target is a CPU. +/// Return type: bool +struct target_is_cpu; + +/// Returns whether the target is a GPU. +/// Return type: bool +struct target_is_gpu; + +/// Returns the AdaptiveCpp runtime backend that is managing the execution of this kernel. +/// Return type: int (sycl::backend cast to int) +struct runtime_backend; + +/// Returns the AdaptiveCpp runtime backend that is managing the execution of this kernel. +/// Return type: compiler_backend +struct compiler_backend; +} + +/// Evaluates at JIT-time the specified query. Query must be one of the types +/// defined in AdaptiveCpp_jit::property. +/// The compiler replaces calls to this function with the return value at JIT-time; +/// Calls to this function will not remain in the final generated code and not cause runtime +/// overhead. +template +auto reflect(); + +/// Evaluates at JIT-time whether the JIT reflection mechanism supports the specified query. +/// Currently, all of the queries listed above are supported universally, but in the future +/// queries might be added that are only supported for certain backends. +/// +/// Query must be one of the types defined in AdaptiveCpp_jit::property. +/// +/// The compiler replaces calls to this function with the return value at JIT-time; +/// Calls to this function will not remain in the final generated code and not cause runtime +/// overhead. +template +bool knows(); + + +/// Code-generates the callable f only if condition evaluates to true at JIT time. +/// +/// condition must evaluate to a value known at JIT time, either using compile-time +/// values or return values from the JIT reflection API. +/// +/// Because the condition is evaluated at JIT time, no runtime overhead +/// will be present in the compiled kernel due to branching. +/// +/// The signature of f is void(). 
+template +void compile_if(bool condition, F&& f); + +/// Code-generates the callable if_branch only if condition evaluates to true at JIT time. +/// Otherwise, the callable else_branch is code-generated. +/// +/// condition must evaluate to a constant at JIT time, either using compile-time +/// constants or return values from the JIT reflection API. +/// +/// Because the condition is evaluated at JIT time, no runtime overhead +/// will be present in the compiled kernel due to branching. +/// +/// The signature of if_branch and else_branch is T() for arbitrary types T. +/// +/// \return If T is not void, compile_if_else() returns the value returned by the +/// user-provided callable that is invoked. +template +auto compile_if_else(bool condition, F&& if_branch, G&& else_branch); + +} + +``` + ### `ACPP_EXT_DYNAMIC_FUNCTIONS` This extension allows users to provide functions used in kernels with definitions selected at runtime. We call such functions *dynamic functions*, since their definition will be determined at runtime using the JIT compiler. Once a kernel using dynamic functions has been JIT-compiled, there are no runtime overheads as dynamic functions are hardwired at JIT-time. @@ -33,7 +172,7 @@ int main() { sycl::queue q; // The dynamic_function_config object stores the JIT-time function mapping information. - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; // Requests calls to execute_operations to be replaced at JIT time // with {myfunction1(idx); myfunction2(idx);} dyn_function_config.define_as_call_sequence(&execute_operations, {&myfunction1, &myfunction2}); @@ -54,7 +193,7 @@ The AdaptiveCpp runtime maintains a kernel cache that automatically distinguishe * It is the user's responsibility to ensure that the `dynamic_function_config` object is kept alive at least until all kernels using it have completed. 
* `dynamic_function_config` is not thread-safe; if one object is shared across multiple threads, it is the user's responsibility to ensure appropriate synchronization. * With this extension, the user can exchange kernel code at runtime. This means that in general, the compiler cannot know at compile time anymore which parts of the code need to be part of device code. Therefore, functions providing the definitions have to be marked as `SYCL_EXTERNAL` to ensure that they are emitted to device code. This can be omitted if the function is invoked from the kernel already at compile time. -* It is possible to provide a "default definition" for dynamic functions by not just declaring them, but also providing a definition (e.g. in the example above, provide a definition for `execute_operations`). However, in this case, we recommend that the function is marked with `__attribute__((noinline))`. Otherwise, in some cases the compiler might decide to already inline the function early on during the optimization process -- and once, inlined, the JIT compiler no loner sees the function and therefore can no longer find function calls to replace. The `noinline` attribute will have no performance implications once the replacement function definition has been put in place by the JIT compiler. Additionally, if the default function does not actually use the function arguments, the frontend might not actually emit function calls to the dynamic function. It is thus a good idea to use `sycl::jit::arguments_are_used()` to assert that these arguments might e.g. be used by a dynamic function replacement function. +* It is possible to provide a "default definition" for dynamic functions by not just declaring them, but also providing a definition (e.g. in the example above, provide a definition for `execute_operations`). However, in this case, we recommend that the function is marked with `__attribute__((noinline))`. 
Otherwise, in some cases the compiler might decide to already inline the function early on during the optimization process -- and once, inlined, the JIT compiler no loner sees the function and therefore can no longer find function calls to replace. The `noinline` attribute will have no performance implications once the replacement function definition has been put in place by the JIT compiler. Additionally, if the default function does not actually use the function arguments, the frontend might not actually emit function calls to the dynamic function. It is thus a good idea to use `sycl::AdaptiveCpp_jit::arguments_are_used()` to assert that these arguments might e.g. be used by a dynamic function replacement function. With a default function definition, the example above might look like so: ```c++ @@ -70,7 +209,7 @@ __attribute__((noinline)) void execute_operations(int* data, sycl::item<1> idx) { // This prevents the compiler from removing calls to execute_operations if it // sees that the function cannot actually have any side-effects. - sycl::jit::arguments_are_used(data, idx); + sycl::AdaptiveCpp_jit::arguments_are_used(data, idx); } int main() { @@ -78,7 +217,7 @@ int main() { int* data = ...; // The dynamic_function_config object stores the JIT-time function mapping information. 
- sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; // Requests calls to execute_operations to be replaced at JIT time // with {myfunction1(idx); myfunction2(idx);} // If this is removed, the regular function definition of execute_operations diff --git a/include/hipSYCL/algorithms/util/memory_streaming.hpp b/include/hipSYCL/algorithms/util/memory_streaming.hpp index 134be33ce..6caab0f49 100644 --- a/include/hipSYCL/algorithms/util/memory_streaming.hpp +++ b/include/hipSYCL/algorithms/util/memory_streaming.hpp @@ -61,15 +61,15 @@ class data_streamer { template static void run(std::size_t problem_size, sycl::nd_item<1> idx, F &&f) noexcept { + namespace jit = sycl::AdaptiveCpp_jit; __acpp_if_target_sscp( - if(__acpp_sscp_jit_reflect_compiler_backend() == - sycl::jit::compiler_backend::host) { - run_host(problem_size, idx, f); - } else { - run_device(problem_size, idx, f); - } - return; - ); + jit::compile_if_else( + jit::reflect() == + jit::compiler_backend::host, + [&]() { run_host(problem_size, idx, f); }, + [&]() { run_device(problem_size, idx, f); }); + + return;); __acpp_if_target_device( run_device(problem_size, idx, f); ); diff --git a/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp b/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp index b2f9f4ed3..f366e725c 100644 --- a/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp +++ b/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp @@ -14,7 +14,7 @@ namespace hipsycl{ namespace sycl { -namespace jit { +namespace AdaptiveCpp_jit { enum class compiler_backend : int { spirv = 0, @@ -51,7 +51,68 @@ extern "C" bool __acpp_sscp_jit_reflect_target_is_cpu(); extern "C" bool __acpp_sscp_jit_reflect_target_is_gpu(); extern "C" bool __acpp_sscp_jit_reflect_target_has_independent_forward_progress(); extern "C" int __acpp_sscp_jit_reflect_runtime_backend(); -extern "C" hipsycl::sycl::jit::compiler_backend 
__acpp_sscp_jit_reflect_compiler_backend(); +extern "C" hipsycl::sycl::AdaptiveCpp_jit::compiler_backend __acpp_sscp_jit_reflect_compiler_backend(); +namespace hipsycl { +namespace sycl { +namespace AdaptiveCpp_jit { + + +namespace reflection_query { + +#define ACPP_DEFINE_REFLECT_QUERY(name) \ + struct name { \ + __attribute__((always_inline)) static bool is_known() { \ + return __acpp_sscp_jit_reflect_knows_##name(); \ + } \ + __attribute__((always_inline)) static auto get() { \ + return __acpp_sscp_jit_reflect_##name(); \ + } \ + }; + +ACPP_DEFINE_REFLECT_QUERY(target_vendor_id) +ACPP_DEFINE_REFLECT_QUERY(target_arch) +ACPP_DEFINE_REFLECT_QUERY(target_has_independent_forward_progress) +ACPP_DEFINE_REFLECT_QUERY(target_is_cpu) +ACPP_DEFINE_REFLECT_QUERY(target_is_gpu) +ACPP_DEFINE_REFLECT_QUERY(runtime_backend) +ACPP_DEFINE_REFLECT_QUERY(compiler_backend) + +#undef ACPP_DEFINE_REFLECT_QUERY + +} + +template +__attribute__((always_inline)) +auto reflect() { + return Query::get(); +} +template +__attribute__((always_inline)) +bool knows() { + return Query::is_known(); +} + +template +__attribute__((always_inline)) +void compile_if(bool condition, F&& f) { + if(condition) { + f(); + } +} + +template +__attribute__((always_inline)) +auto compile_if_else(bool condition, F&& if_branch, G&& else_branch) { + if(condition) { + return if_branch(); + } else{ + return else_branch(); + } +} + +} +} +} #endif diff --git a/include/hipSYCL/sycl/extensions.hpp b/include/hipSYCL/sycl/extensions.hpp index c88d284fd..b7399ee40 100644 --- a/include/hipSYCL/sycl/extensions.hpp +++ b/include/hipSYCL/sycl/extensions.hpp @@ -74,5 +74,6 @@ #define ACPP_EXT_QUEUE_PRIORITY #define ACPP_EXT_SPECIALIZED #define ACPP_EXT_DYNAMIC_FUNCTIONS +#define ACPP_EXT_JIT_COMPILE_IF #endif diff --git a/include/hipSYCL/sycl/jit.hpp b/include/hipSYCL/sycl/jit.hpp index 912c7c08d..c9b9e686e 100644 --- a/include/hipSYCL/sycl/jit.hpp +++ b/include/hipSYCL/sycl/jit.hpp @@ -36,7 +36,7 @@ extern "C" void 
__acpp_function_annotation_dynamic_function_def_arg1(); template void __acpp_function_annotation_argument_used(T&& x); -namespace hipsycl::sycl::jit { +namespace hipsycl::sycl::AdaptiveCpp_jit { template void arguments_are_used(T&& x) { @@ -276,6 +276,10 @@ class dynamic_function_config { } +namespace hipsycl::sycl::jit { +using namespace hipsycl::sycl::AdaptiveCpp_jit; +} + #endif // IS_DEVICE_PASS_SSCP #endif diff --git a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp index 0ebc5eb73..3b6110d73 100644 --- a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp +++ b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp @@ -209,7 +209,7 @@ class RocmDeviceLibs { }; LLVMToAmdgpuTranslator::LLVMToAmdgpuTranslator(const std::vector &KN) - : LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::amdgpu), KN, KN}, + : LLVMToBackendTranslator{static_cast(sycl::AdaptiveCpp_jit::compiler_backend::amdgpu), KN, KN}, KernelNames{KN} { RocmDeviceLibsPath = common::filesystem::join_path(RocmPath, std::vector{"amdgcn", "bitcode"}); diff --git a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp index 10b7eb654..80640df32 100644 --- a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp +++ b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp @@ -59,7 +59,7 @@ namespace hipsycl { namespace compiler { LLVMToHostTranslator::LLVMToHostTranslator(const std::vector &KN) - : LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::host), KN, KN}, + : LLVMToBackendTranslator{static_cast(sycl::AdaptiveCpp_jit::compiler_backend::host), KN, KN}, KernelNames{KN} {} bool LLVMToHostTranslator::toBackendFlavor(llvm::Module &M, PassHandler &PH) { diff --git a/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp b/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp index ff1538ab1..74ea4f546 100644 --- a/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp +++ 
b/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp @@ -155,7 +155,7 @@ void replaceBrokenLLVMIntrinsics(llvm::Module& M) { } LLVMToPtxTranslator::LLVMToPtxTranslator(const std::vector &KN) - : LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::ptx), KN, KN}, + : LLVMToBackendTranslator{static_cast(sycl::AdaptiveCpp_jit::compiler_backend::ptx), KN, KN}, KernelNames{KN} {} bool LLVMToPtxTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { diff --git a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp index 2a64a3440..e50040bb7 100644 --- a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp +++ b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp @@ -123,7 +123,7 @@ void assignSPIRCallConvention(llvm::Function *F) { } LLVMToSpirvTranslator::LLVMToSpirvTranslator(const std::vector &KN) - : LLVMToBackendTranslator{static_cast(sycl::jit::compiler_backend::spirv), KN, KN}, + : LLVMToBackendTranslator{static_cast(sycl::AdaptiveCpp_jit::compiler_backend::spirv), KN, KN}, KernelNames{KN} {} bool LLVMToSpirvTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) { diff --git a/tests/compiler/sscp/dynamic_function.cpp b/tests/compiler/sscp/dynamic_function.cpp index 076c01f41..0377050f1 100644 --- a/tests/compiler/sscp/dynamic_function.cpp +++ b/tests/compiler/sscp/dynamic_function.cpp @@ -21,7 +21,7 @@ SYCL_EXTERNAL void myfunction2(int* data, sycl::item<1> idx) { __attribute__((noinline)) void execute_operations_with_definition(int* data, sycl::item<1> idx) { - sycl::jit::arguments_are_used(data, idx); + sycl::AdaptiveCpp_jit::arguments_are_used(data, idx); } void execute_operations_without_definition(int* data, sycl::item<1> idx); @@ -34,7 +34,7 @@ int main() { { *data = 0; - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; dyn_function_config.define(&execute_operations_without_definition, &myfunction1); 
q.parallel_for(sycl::range{1}, dyn_function_config.apply([=](sycl::item<1> idx){ execute_operations_without_definition(data, idx); @@ -48,7 +48,7 @@ int main() { { *data = 0; - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; dyn_function_config.define(&execute_operations_without_definition, &myfunction1); q.parallel_for(sycl::range{1}, dyn_function_config.apply([=](sycl::item<1> idx){ execute_operations_without_definition(data, idx); @@ -62,7 +62,7 @@ int main() { { *data = 0; - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; dyn_function_config.define_as_call_sequence(&execute_operations_without_definition, {&myfunction1, &myfunction2}); q.parallel_for(sycl::range{1}, dyn_function_config.apply([=](sycl::item<1> idx){ execute_operations_without_definition(data, idx); diff --git a/tests/compiler/sscp/s2_reflection.cpp b/tests/compiler/sscp/s2_reflection.cpp index c9b8fe918..92295d99a 100644 --- a/tests/compiler/sscp/s2_reflection.cpp +++ b/tests/compiler/sscp/s2_reflection.cpp @@ -64,5 +64,25 @@ int main() { // CHECK: 0 std::cout << data[6] << std::endl; + q.single_task([=]() { + __acpp_if_target_device( + auto backend = sycl::AdaptiveCpp_jit::reflect< + sycl::AdaptiveCpp_jit::reflection_query::runtime_backend>(); + data[0] = sycl::AdaptiveCpp_jit::compile_if_else( + backend == static_cast(sycl::backend::omp), + []() { return 1; }, + []() { return 0; }); + data[1] = sycl::AdaptiveCpp_jit::knows< + sycl::AdaptiveCpp_jit::reflection_query::runtime_backend>(); + ); + }).wait(); + // CHECK: 1 + std::cout << (data[0] == (q.get_device().get_backend() == + sycl::backend::omp)) + << std::endl; + + // CHECK: 1 + std::cout << data[1] << std::endl; + sycl::free(data, q); } From d853114c1fd7b6120beee2b267d948f95b9337e9 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 20 Nov 2024 19:44:42 +0100 Subject: [PATCH 041/126] 
Add the preliminary default context KHR extension (#1590) * Add default context * Add test case --- include/hipSYCL/runtime/device_id.hpp | 7 +++ include/hipSYCL/sycl/context.hpp | 61 +++++++++++++++++++++++++-- include/hipSYCL/sycl/extensions.hpp | 4 ++ include/hipSYCL/sycl/platform.hpp | 3 +- include/hipSYCL/sycl/queue.hpp | 23 ++++++++-- tests/sycl/extensions.cpp | 12 ++++++ 6 files changed, 101 insertions(+), 9 deletions(-) diff --git a/include/hipSYCL/runtime/device_id.hpp b/include/hipSYCL/runtime/device_id.hpp index 0a346a388..0f9704862 100644 --- a/include/hipSYCL/runtime/device_id.hpp +++ b/include/hipSYCL/runtime/device_id.hpp @@ -14,6 +14,7 @@ #include #include #include +#include namespace hipsycl { namespace rt { @@ -101,6 +102,12 @@ class device_id { return !(a == b); } + + uint64_t hash_code() const { + uint32_t backend = static_cast(_backend.id); + uint32_t id = _device_id; + return (static_cast(backend) << 32) | id; + } private: backend_descriptor _backend; int _device_id; diff --git a/include/hipSYCL/sycl/context.hpp b/include/hipSYCL/sycl/context.hpp index 572719e04..0ef0987c3 100644 --- a/include/hipSYCL/sycl/context.hpp +++ b/include/hipSYCL/sycl/context.hpp @@ -32,6 +32,8 @@ class context; namespace detail { const rt::unique_device_list& extract_context_devices(const context&); + +struct default_context_tag_t{}; } class context @@ -84,6 +86,38 @@ class context _impl->devices.add(detail::get_host_device()); } + explicit context( + detail::default_context_tag_t, + async_handler handler = + [](exception_list e) { glue::default_async_handler(e); }) + : context{handler} { + _impl->is_default_context = true; + } + + explicit context( + detail::default_context_tag_t, const device &dev, + async_handler handler = + [](exception_list e) { glue::default_async_handler(e); }) + : context{dev, handler} { + _impl->is_default_context = true; + } + + explicit context( + detail::default_context_tag_t, const platform &plt, + async_handler handler = + 
[](exception_list e) { glue::default_async_handler(e); }) + : context{plt, handler} { + _impl->is_default_context = true; + } + + explicit context( + detail::default_context_tag_t, const std::vector &deviceList, + async_handler handler = + [](exception_list e) { glue::default_async_handler(e); }) + : context{deviceList, handler} { + _impl->is_default_context = true; + } + bool is_host() const { bool has_non_host_devices = false; _impl->devices.for_each_device([&](rt::device_id d) { @@ -135,11 +169,26 @@ class context } std::size_t AdaptiveCpp_hash_code() const { + if(_impl && _impl->is_default_context) { + std::size_t hash = 0; + _impl->devices.for_each_device([&](rt::device_id dev){ + // xor ensures that device order does not matter + hash ^= dev.hash_code(); + }); + return hash; + } return std::hash{}(_impl.get()); } - friend bool operator ==(const context& lhs, const context& rhs) - { return lhs._impl == rhs._impl; } + friend bool operator ==(const context& lhs, const context& rhs) { + + if (lhs._impl && rhs._impl && lhs._impl->is_default_context && + rhs._impl->is_default_context) { + return lhs._impl->devices == rhs._impl->devices; + } + + return lhs._impl == rhs._impl; + } friend bool operator!=(const context& lhs, const context &rhs) { return !(lhs == rhs); } @@ -148,7 +197,6 @@ class context return _impl->requires_runtime.get(); } - [[deprecated("Use AdaptiveCpp_hash_code()")]] auto hipSYCL_hash_code() const { return AdaptiveCpp_hash_code(); @@ -179,7 +227,8 @@ class context context_impl() : devices{requires_runtime.get()} {} - async_handler handler; + async_handler handler; + bool is_default_context = false; }; std::shared_ptr _impl; @@ -203,6 +252,10 @@ inline const rt::unique_device_list &extract_context_devices(const context &ctx) } +inline context platform::khr_get_default_context() const { + return context{detail::default_context_tag_t{}, *this}; +} + inline exception::exception(context ctx, std::error_code ec, const std::string& what_arg) : 
_context{std::make_shared(ctx)}, error_code{ec}, _msg{what_arg} {} diff --git a/include/hipSYCL/sycl/extensions.hpp b/include/hipSYCL/sycl/extensions.hpp index b7399ee40..57cbc3c1b 100644 --- a/include/hipSYCL/sycl/extensions.hpp +++ b/include/hipSYCL/sycl/extensions.hpp @@ -76,4 +76,8 @@ #define ACPP_EXT_DYNAMIC_FUNCTIONS #define ACPP_EXT_JIT_COMPILE_IF +// KHR extensions + +#define SYCL_KHR_DEFAULT_CONTEXT 1 + #endif diff --git a/include/hipSYCL/sycl/platform.hpp b/include/hipSYCL/sycl/platform.hpp index 9d2046862..153213e2b 100644 --- a/include/hipSYCL/sycl/platform.hpp +++ b/include/hipSYCL/sycl/platform.hpp @@ -126,7 +126,8 @@ class platform { return AdaptiveCpp_hash_code(); } - + + context khr_get_default_context() const; private: rt::platform_id _platform; rt::runtime_keep_alive_token _requires_runtime; diff --git a/include/hipSYCL/sycl/queue.hpp b/include/hipSYCL/sycl/queue.hpp index bef9dc022..477978a2b 100644 --- a/include/hipSYCL/sycl/queue.hpp +++ b/include/hipSYCL/sycl/queue.hpp @@ -190,11 +190,12 @@ class queue : public detail::property_carrying_object : queue{detail::select_devices(deviceSelector), asyncHandler, propList} {} explicit queue(const device &syclDevice, const property_list &propList = {}) - : queue{context{syclDevice}, std::vector{syclDevice}, propList} {} + : queue{get_default_context(syclDevice), std::vector{syclDevice}, + propList} {} explicit queue(const device &syclDevice, const async_handler &asyncHandler, const property_list &propList = {}) - : queue{context{syclDevice, asyncHandler}, std::vector{syclDevice}, + : queue{get_default_context(syclDevice), std::vector{syclDevice}, asyncHandler, propList} {} template < @@ -231,10 +232,10 @@ class queue : public detail::property_carrying_object explicit queue(const std::vector &devices, const async_handler &handler, const property_list &propList = {}) - : queue{context{devices, handler}, devices, handler, propList} {} + : queue{get_default_context(devices), devices, handler, propList} {} 
explicit queue(const std::vector& devices, const property_list& propList = {}) - : queue{context{devices}, devices, propList} {} + : queue{get_default_context(devices), devices, propList} {} explicit queue(const context &syclContext, const std::vector &devices, const property_list &propList = {}) @@ -1023,6 +1024,20 @@ class queue : public detail::property_carrying_object return AdaptiveCpp_inorder_executor(); } private: + static context get_default_context(const device& dev) { + return context{detail::default_context_tag_t{}, dev.get_platform()}; + } + + static context get_default_context(const std::vector &devices) { + if(devices.empty()) + return context{detail::default_context_tag_t{}}; + if(devices.size() == 1){ + return context{detail::default_context_tag_t{}, devices[0].get_platform()}; + } else { + return context{detail::default_context_tag_t{}, devices}; + } + } + template void apply_preferred_group_size(const property_list& prop_list, handler& cgh) { if(prop_list.has_property>()){ diff --git a/tests/sycl/extensions.cpp b/tests/sycl/extensions.cpp index 63668ca7c..507bc09b5 100644 --- a/tests/sycl/extensions.cpp +++ b/tests/sycl/extensions.cpp @@ -1222,5 +1222,17 @@ BOOST_AUTO_TEST_CASE(sycl_specialized) { sycl::free(data, q); } #endif +#ifdef SYCL_KHR_DEFAULT_CONTEXT +BOOST_AUTO_TEST_CASE(khr_default_context) { + using namespace cl; + sycl::queue q1; + sycl::queue q2; + + BOOST_CHECK(q1.get_context() == q2.get_context()); + BOOST_CHECK(q1.get_device().get_platform().khr_get_default_context() == + q1.get_context()); + BOOST_CHECK(sycl::context{} != q1.get_context()); +} +#endif BOOST_AUTO_TEST_SUITE_END() From 51afbbf660ccab9735e65100004e5a689a2030f0 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 21 Nov 2024 03:25:37 +0100 Subject: [PATCH 042/126] [SSCP][Adaptivity] Include runtime-detected alignment of pointer kernel arguments in JIT-compiled kernels (#1601) * [SSCP][Adaptivity] Infer alignment of pointer arguments * Add missing alignment 
configuration in jit::compiler --- .../KnownPtrParamAlignmentOptPass.hpp | 34 ++++++++++ .../llvm-to-backend/LLVMToBackend.hpp | 4 ++ include/hipSYCL/glue/llvm-sscp/jit.hpp | 5 ++ .../hipSYCL/runtime/kernel_configuration.hpp | 22 +++++++ src/compiler/llvm-to-backend/CMakeLists.txt | 1 + .../KnownPtrParamAlignmentOptPass.cpp | 62 +++++++++++++++++++ .../llvm-to-backend/LLVMToBackend.cpp | 16 +++++ src/runtime/adaptivity_engine.cpp | 27 ++++++++ 8 files changed, 171 insertions(+) create mode 100644 include/hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp create mode 100644 src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp diff --git a/include/hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp b/include/hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp new file mode 100644 index 000000000..2e03629ea --- /dev/null +++ b/include/hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp @@ -0,0 +1,34 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_KNOWN_PTR_PARAM_ALIGNMENT_OPT_PASS_HPP +#define HIPSYCL_SSCP_KNOWN_PTR_PARAM_ALIGNMENT_OPT_PASS_HPP + +#include +#include + +namespace hipsycl { +namespace compiler { + +class KnownPtrParamAlignmentOptPass : public llvm::PassInfoMixin { +public: + KnownPtrParamAlignmentOptPass( + const std::unordered_map>> &KnownAlignments); + llvm::PreservedAnalyses run(llvm::Module &M, + llvm::ModuleAnalysisManager &MAM); +private: + std::unordered_map>> KnownPtrParamAlignments; +}; + +} +} + +#endif + diff --git a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp index 7499e14f8..070dc9adc 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp @@ -56,6 +56,8 @@ class LLVMToBackendTranslator { const std::vector &ReplacementCalls, bool OverrideOnlyUndefined=true); + void setKnownPtrParamAlignment(const std::string &FunctionName, int ParamIndex, int Alignment); + bool setBuildFlag(const std::string &Flag); bool setBuildOption(const std::string &Option, const std::string &Value); bool setBuildToolArguments(const std::string &ToolName, const std::vector &Args); @@ -233,6 +235,8 @@ class LLVMToBackendTranslator { std::vector*>> FunctionsForDeadArgumentElimination; + // map from kernel name to list of (param index, alignment) + std::unordered_map>> KnownPtrParamAlignments; std::unordered_map ReflectionFields; }; diff --git a/include/hipSYCL/glue/llvm-sscp/jit.hpp b/include/hipSYCL/glue/llvm-sscp/jit.hpp index 3f552b95b..0e567b47e 100644 --- a/include/hipSYCL/glue/llvm-sscp/jit.hpp +++ b/include/hipSYCL/glue/llvm-sscp/jit.hpp @@ -245,6 +245,11 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, translator->specializeKernelArgument(translator->getKernels().front(), entry.first, &entry.second); } + + for(const auto& entry : 
config.known_alignments()) { + translator->setKnownPtrParamAlignment(translator->getKernels().front(), + entry.first, entry.second); + } } for(const auto& entry : config.function_call_specialization_config()) { auto& config = entry.value->function_call_map; diff --git a/include/hipSYCL/runtime/kernel_configuration.hpp b/include/hipSYCL/runtime/kernel_configuration.hpp index f4a717914..de8ab2cdb 100644 --- a/include/hipSYCL/runtime/kernel_configuration.hpp +++ b/include/hipSYCL/runtime/kernel_configuration.hpp @@ -129,6 +129,16 @@ class kernel_configuration { _build_flags.push_back(flag); } + void set_known_alignment(int param_index, int alignment) { + for(auto& entry : _known_alignments) { + if(entry.first == param_index) { + entry.second = alignment; + return; + } + } + _known_alignments.push_back(std::make_pair(param_index, alignment)); + } + template void append_base_configuration(kernel_base_config_parameter key, const ValueT &value) { @@ -181,6 +191,13 @@ class kernel_configuration { &config_id, sizeof(config_id)); } + for(const auto& entry : _known_alignments) { + uint64_t numeric_option_id = static_cast(entry.first) | (1ull << 37); + uint64_t config_id = entry.second; + add_entry_to_hash(result, &numeric_option_id, sizeof(numeric_option_id), + &config_id, sizeof(config_id)); + } + return result; } @@ -200,6 +217,10 @@ class kernel_configuration { return _function_call_specializations; } + const auto& known_alignments() const { + return _known_alignments; + } + private: static const void* data_ptr(const char* data) { return data_ptr(std::string{data}); @@ -260,6 +281,7 @@ class kernel_configuration { std::vector> _specialized_kernel_args; std::vector _function_call_specializations; + std::vector> _known_alignments; id_type _base_configuration_result = {}; }; diff --git a/src/compiler/llvm-to-backend/CMakeLists.txt b/src/compiler/llvm-to-backend/CMakeLists.txt index ea62f010d..741c2ef55 100644 --- a/src/compiler/llvm-to-backend/CMakeLists.txt +++ 
b/src/compiler/llvm-to-backend/CMakeLists.txt @@ -117,6 +117,7 @@ if(WITH_SSCP_COMPILER) LLVMToBackend.cpp AddressSpaceInferencePass.cpp KnownGroupSizeOptPass.cpp + KnownPtrParamAlignmentOptPass.cpp GlobalSizesFitInI32OptPass.cpp GlobalInliningAttributorPass.cpp DeadArgumentEliminationPass.cpp diff --git a/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp b/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp new file mode 100644 index 000000000..177dad17b --- /dev/null +++ b/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp @@ -0,0 +1,62 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp" +#include +#include +#include +#include +#include +#include + +namespace hipsycl { +namespace compiler { + +KnownPtrParamAlignmentOptPass::KnownPtrParamAlignmentOptPass( + const std::unordered_map>> &KnownAlignments) + : KnownPtrParamAlignments{KnownAlignments} {} + +llvm::PreservedAnalyses KnownPtrParamAlignmentOptPass::run(llvm::Module &M, + llvm::ModuleAnalysisManager &MAM) { + llvm::Function *AssumeFunc = llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::assume); + + for(auto& Entry : KnownPtrParamAlignments) { + if(auto* F = M.getFunction(Entry.first)) { + int NumParams = F->getFunctionType()->getNumParams(); + + if(!F->isDeclaration()) { + for(auto& AlignmentInfo : Entry.second) { + int ParamIndex = AlignmentInfo.first; + if(ParamIndex < NumParams) { + llvm::Value* PtrValue = F->getArg(ParamIndex); + llvm::Constant *True = llvm::ConstantInt::get(M.getContext(), llvm::APInt(1, 1)); + llvm::OperandBundleDef AlignBundle{ + "align", std::vector{ + PtrValue, 
llvm::ConstantInt::get( + M.getContext(), llvm::APInt(64, AlignmentInfo.second))}}; + + llvm::Instruction *InsertionPoint = &(*F->getEntryBlock().getFirstInsertionPt()); + llvm::CallInst::Create( + llvm::FunctionCallee{AssumeFunc}, llvm::ArrayRef{True}, + llvm::ArrayRef{AlignBundle}, "", InsertionPoint); + } + } + } + } + } + + return llvm::PreservedAnalyses::none(); +} + + +} +} + diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index 77758fa2e..b3da6fee4 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -15,6 +15,7 @@ #include "hipSYCL/compiler/llvm-to-backend/GlobalInliningAttributorPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/KnownGroupSizeOptPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp" +#include "hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" @@ -281,9 +282,13 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { KnownGroupSizeOptPass GroupSizeOptPass{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ}; GlobalSizesFitInI32OptPass SizesAsIntOptPass{GlobalSizesFitInInt, KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ}; + GroupSizeOptPass.run(M, MAM); SizesAsIntOptPass.run(M, MAM); + KnownPtrParamAlignmentOptPass KnownAlignmentOptPass{KnownPtrParamAlignments}; + KnownAlignmentOptPass.run(M, MAM); + // Before optimizing, make sure everything has internal linkage to // help inlining. 
All linking should have occured by now, except // for backend builtin libraries like libdevice etc @@ -686,6 +691,17 @@ void LLVMToBackendTranslator::runKernelDeadArgumentElimination( } } +void LLVMToBackendTranslator::setKnownPtrParamAlignment(const std::string &FunctionName, + int ParamIndex, int Alignment) { + for (auto &Entry : KnownPtrParamAlignments[FunctionName]) { + if (Entry.first == ParamIndex) { + Entry.second = Alignment; + return; + } + } + KnownPtrParamAlignments[FunctionName].push_back(std::make_pair(ParamIndex, Alignment)); +} + void LLVMToBackendTranslator::setReflectionField(const std::string &str, uint64_t value) { ReflectionFields[str] = value; } diff --git a/src/runtime/adaptivity_engine.cpp b/src/runtime/adaptivity_engine.cpp index fbde51a46..0263f1d01 100644 --- a/src/runtime/adaptivity_engine.cpp +++ b/src/runtime/adaptivity_engine.cpp @@ -151,6 +151,16 @@ bool is_likely_invariant_argument(common::db::kernel_entry &kernel_entry, return false; } + +int determine_ptr_alignment(uint64_t ptrval) { +#if __has_builtin(__builtin_ctz) + int max_alignment = std::min(1 << __builtin_ctz(ptrval), 32); + return max_alignment > 4 ? 
max_alignment : 0; +#else + return 0; +#endif +} + } kernel_adaptivity_engine::kernel_adaptivity_engine( @@ -221,6 +231,23 @@ kernel_adaptivity_engine::finalize_binary_configuration( config.set_specialized_kernel_argument(i, buffer_value); } } + + // Handle auto alignment specialization + for(int i = 0; i < _kernel_info->get_num_parameters(); ++i) { + std::size_t arg_size = _kernel_info->get_argument_size(i); + if (_kernel_info->get_argument_type(i) == hcf_kernel_info::argument_type::pointer) { + uint64_t buffer = 0; + std::memcpy(&buffer, _arg_mapper.get_mapped_args()[i], + _kernel_info->get_argument_size(i)); + int alignment = determine_ptr_alignment(buffer); + if(alignment > 0) { + HIPSYCL_DEBUG_INFO + << "adaptivity_engine: Inferred pointer alignment of " + << alignment << " for kernel argument " << i << std::endl; + config.set_known_alignment(i, alignment); + } + } + } } if(_adaptivity_level > 1) { From 8e2e67429cb3bdec617b83b716fd885b28384a2f Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 21 Nov 2024 20:32:04 +0100 Subject: [PATCH 043/126] Account for older GCC versions not supporting __has_builtin (#1602) --- src/runtime/adaptivity_engine.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/runtime/adaptivity_engine.cpp b/src/runtime/adaptivity_engine.cpp index 0263f1d01..5db2c8ff6 100644 --- a/src/runtime/adaptivity_engine.cpp +++ b/src/runtime/adaptivity_engine.cpp @@ -153,7 +153,18 @@ bool is_likely_invariant_argument(common::db::kernel_entry &kernel_entry, } int determine_ptr_alignment(uint64_t ptrval) { -#if __has_builtin(__builtin_ctz) +#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && \ + !defined(__NVCOMPILER) + // gcc supports __builtin_ctz, but versions prior to 10 + // do not support __has_builtin + #define ACPP_HAS_BUILTIN_CTZ +#else + #if __has_builtin(__builtin_ctz) + #define ACPP_HAS_BUILTIN_CTZ + #endif +#endif + +#ifdef ACPP_HAS_BUILTIN_CTZ int max_alignment = std::min(1 << 
__builtin_ctz(ptrval), 32); return max_alignment > 4 ? max_alignment : 0; #else From 0d0e4cc73f8d251fb6f42f9cb2bc9621522c67a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Fri, 22 Nov 2024 11:52:49 +0000 Subject: [PATCH 044/126] add --acpp-dryrun-noplugin flag to acpp --- bin/acpp | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/bin/acpp b/bin/acpp index c9d24077a..d48b6a12d 100755 --- a/bin/acpp +++ b/bin/acpp @@ -346,6 +346,9 @@ class acpp_config: 'is-dryrun': option("--acpp-dryrun", "ACPP_DRYRUN", "default-is-dryrun", """ If set, only shows compilation commands that would be executed, but does not actually execute it. """), + 'is-dryrun-noplugin': option("--acpp-dryrun-noplugin", "ACPP_DRYRUN_NOPLUGIN", "default-is-dryrun-noplugin", +""" If set, only shows compilation commands that would be executed, + but does not actually execute it. This version also remove -fplugin flags."""), 'is-explicit-multipass': option("--acpp-explicit-multipass", "ACPP_EXPLICIT_MULTIPASS", "default-is-explicit-multipass", """ If set, executes device passes as separate compiler invocation and lets AdaptiveCpp control embedding device @@ -801,6 +804,13 @@ class acpp_config: except OptionNotSet: return False + @property + def is_dryrun_noplugin(self): + try: + return self._is_flag_set("is-dryrun-noplugin") + except OptionNotSet: + return False + @property def use_accelerated_cpu(self): try: @@ -925,7 +935,15 @@ class acpp_config: def is_pure_linking_stage(self): return len(self.source_file_arguments) == 0 -def run_or_print(command, print_only): +def run_or_print(command, print_only, noplugin=False): + + if(noplugin): + new_cmd = [] + for arg in command: + if not (arg.startswith("-fplugin=") or arg.startswith("-fpass-plugin")): + new_cmd.append(arg) + command = new_cmd + if not print_only: return subprocess.call(command) else: @@ -1562,7 +1580,8 @@ class compiler: self._user_args = 
config.forwarded_compiler_arguments self._requires_linking = config.contains_linking_stage() self._requires_compilation = not config.is_pure_linking_stage() - self._is_dry_run = config.is_dryrun + self._is_dry_run = config.is_dryrun or config.is_dryrun_noplugin + self._no_plugins = config.is_dryrun_noplugin self._targets = config.targets self._common_compiler_args = config.common_compiler_args self._acpp_path = config.acpp_installation_path @@ -1896,7 +1915,7 @@ class compiler: args += ld_flags return run_or_print([compiler_executable] + args, - self._is_dry_run) + self._is_dry_run, self._no_plugins) def run(self): temp_prefix = "adaptivecpp-" From 3d07de851d3d6a15d8f657c06046495d44aa230b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Fri, 22 Nov 2024 11:57:09 +0000 Subject: [PATCH 045/126] add --acpp-dryrun-noplugin flag to acpp --- bin/acpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/acpp b/bin/acpp index d48b6a12d..ee072c903 100755 --- a/bin/acpp +++ b/bin/acpp @@ -348,7 +348,7 @@ class acpp_config: but does not actually execute it. """), 'is-dryrun-noplugin': option("--acpp-dryrun-noplugin", "ACPP_DRYRUN_NOPLUGIN", "default-is-dryrun-noplugin", """ If set, only shows compilation commands that would be executed, - but does not actually execute it. This version also remove -fplugin flags."""), + but does not actually execute it. 
This version also remove -fplugin related flags."""), 'is-explicit-multipass': option("--acpp-explicit-multipass", "ACPP_EXPLICIT_MULTIPASS", "default-is-explicit-multipass", """ If set, executes device passes as separate compiler invocation and lets AdaptiveCpp control embedding device From 39d058b450965cc8eb7738f078af29aa3636ad77 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 22 Nov 2024 22:54:23 +0100 Subject: [PATCH 046/126] [clang-tidy] Add missing include to memory_streaming.hpp (#1606) --- include/hipSYCL/algorithms/util/memory_streaming.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/hipSYCL/algorithms/util/memory_streaming.hpp b/include/hipSYCL/algorithms/util/memory_streaming.hpp index 6caab0f49..fba2b5f54 100644 --- a/include/hipSYCL/algorithms/util/memory_streaming.hpp +++ b/include/hipSYCL/algorithms/util/memory_streaming.hpp @@ -15,6 +15,7 @@ #include "hipSYCL/sycl/device.hpp" #include "hipSYCL/sycl/libkernel/nd_item.hpp" #include "hipSYCL/sycl/info/device.hpp" +#include "hipSYCL/sycl/jit.hpp" #include From f0975d9b709922f499273021a1174fd2d0a6f402 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 22 Nov 2024 23:47:34 +0100 Subject: [PATCH 047/126] [NFC][doc] Performance guide: Add section on buffers vs USM (#1603) * [NFC][doc] Performance guide: Add section on buffers vs USM * [doc] Improve buffers-vs-USM section * Fix formatting --- doc/performance.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/doc/performance.md b/doc/performance.md index f02a12a2d..5121a5040 100644 --- a/doc/performance.md +++ b/doc/performance.md @@ -32,6 +32,41 @@ The other compilation flows `omp`, `cuda`, `hip` should mainly be used when *int * If you are unsure, the compilation flags used by clang-based compilers under the hood can be inspected using ` -###`. This also works for AdaptiveCpp's clang-based compilation flows. When in doubt, use this mechanism to align compilation flags between compilers. 
* The compiler invocation that `acpp` generates can be printed and its flags inspected with `--acpp-dryrun`. +## SYCL memory management: USM vs buffers + +There are three kinds of unified shared memory (USM) in SYCL: + +* host USM (`sycl::malloc_host`). This is device-accessible host memory, similarly to CUDA pinned memory. It is usually only situationally useful, e.g. when the memory is only rarely accessed on GPU and a full data copy might be unnecessary. +* device USM (`sycl::malloc_device`). This is device-resident memory that is unavailable on the host or other devices. It always stays on that device, and is similar to CUDA's `cudaMalloc`. It provides very low overhead. Explicit data transfers mechanisms need to be invoked by the user to migrate data between host and device. Device USM is usually the most efficient memory for usage on device in SYCL. +* shared USM (`sycl::malloc_shared`). This is memory that can automatically migrate between host and device, or potentially other devices, similarly to e.g. CUDA's cudaMallocManaged. + +AdaptiveCpp supports all forms of USM universally on all backends and supported hardware. + +Additionally, SYCL provides the `sycl::buffer`/`sycl::accessor` model. + +Generally, SYCL buffers are inferior to USM when it comes to performance: +* **All types of USM have significantly lower host-side runtime overhead compared to buffers**, and can substantially outperform buffers, especially for short running kernels where submission latencies matter. This is especially true when in-order queues are used (See e.g. this paper for details: https://dl.acm.org/doi/fullHtml/10.1145/3648115.3648120) + * With USM, the programmer can express dependencies statically, while the SYCL buffer-accessor model must figure out dependencies at runtime. Similarly, USM allows to statically express allocation/deallocation and data transfers, while with buffers non-trivial mechanisms in the SYCL runtime need to automatically manage these operations. 
This can add overhead. +* Buffer accessors are not lightweight objects, and can increase register pressure in kernels compared to USM pointers. +* Buffers may behave in unexpected ways that can silently introduce performance issues, for example buffer destructors synchronize in certain cases to wait for work to complete. + +For shared USM specifically: +* Performance of shared USM typically depends on memory access patterns and driver quality. Depending on the operating system and hardware, very good performance is also possible with shared USM. + * On CPU, shared USM is identical to device USM by design, and consequently there is no performance overhead + * Performance on NVIDIA GPUs is typically excellent + * On Intel GPUs, performance is typically good, depending on memory access patterns. On dedicated Intel GPUs, note that current hardware and drivers do not support data migration at page granularity, i.e. always entire allocations will be migrated at a time if data is accessed on host/device. This is not an issue on iGPU. + * On AMD GPUs, performance can be good, but for some driver/OS/hardware setups may be substantially degraded if the XNACK hardware feature is not available. +* Performance of shared USM can often be improved using the `queue::prefetch()` performance hint. + + +Shared USM is the most productive memory management model that SYCL has, and can be a great solution for e.g. rapid prototyping or porting CPU code. **Shared USM is also less verbose and more productive than using SYCL buffers!** + + +In summary: +* **When control and maximum performance is needed, use device USM (`sycl::malloc_device`)** +* **When maximum productivity is needed, use shared USM (`sycl::malloc_shared`)** +* **Never use buffers. They do not bring significant advantages compared to USM, but can introduce substantial drawbacks!** + ## Ahead-of-time vs JIT compilation The compilation targets `omp`, `hip` and `cuda` perform ahead-of-time compilation. 
This means they depend strongly on the user to provide correct optimization flags when compiling. From 2ce072baf9cc57d85cf4a3f6e0baf759162c1f2a Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 22 Nov 2024 23:48:22 +0100 Subject: [PATCH 048/126] [SSCP][Adaptivity] Improve alignment handling (#1604) --- src/runtime/adaptivity_engine.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/runtime/adaptivity_engine.cpp b/src/runtime/adaptivity_engine.cpp index 5db2c8ff6..1fd006f53 100644 --- a/src/runtime/adaptivity_engine.cpp +++ b/src/runtime/adaptivity_engine.cpp @@ -153,6 +153,9 @@ bool is_likely_invariant_argument(common::db::kernel_entry &kernel_entry, } int determine_ptr_alignment(uint64_t ptrval) { + if(ptrval == 0) + return 0; + #if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && \ !defined(__NVCOMPILER) // gcc supports __builtin_ctz, but versions prior to 10 @@ -165,8 +168,8 @@ int determine_ptr_alignment(uint64_t ptrval) { #endif #ifdef ACPP_HAS_BUILTIN_CTZ - int max_alignment = std::min(1 << __builtin_ctz(ptrval), 32); - return max_alignment > 4 ? max_alignment : 0; + uint64_t alignment = 1ull << __builtin_ctz(ptrval); + return alignment >= 32 ? 
32 : 0; #else return 0; #endif From cad14c871a13d5c0dd63bf729dd8b2d0d9978bca Mon Sep 17 00:00:00 2001 From: VaiTon Date: Tue, 26 Nov 2024 16:31:25 +0100 Subject: [PATCH 049/126] Add missing llvm/IR/Module.h header in KnownPtrParamAlignmentOptPass.cpp (#1615) --- src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp b/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp index 177dad17b..43b3ba05d 100644 --- a/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp +++ b/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace hipsycl { From 049d18b9be277d8f8c72bf84cb452d79d4f02c92 Mon Sep 17 00:00:00 2001 From: VaiTon Date: Thu, 28 Nov 2024 16:37:40 +0100 Subject: [PATCH 050/126] Invert if condition in omp_queue.cpp (#1612) --- src/runtime/omp/omp_queue.cpp | 170 +++++++++++++++++----------------- 1 file changed, 84 insertions(+), 86 deletions(-) diff --git a/src/runtime/omp/omp_queue.cpp b/src/runtime/omp/omp_queue.cpp index 0b2a94687..e5637f32b 100644 --- a/src/runtime/omp/omp_queue.cpp +++ b/src/runtime/omp/omp_queue.cpp @@ -267,90 +267,7 @@ result omp_queue::submit_memcpy(memcpy_operation &op, const dag_node_ptr& node) HIPSYCL_DEBUG_INFO << "omp_queue: Submitting memcpy operation..." 
<< std::endl; - if (op.source().get_device().is_host() && op.dest().get_device().is_host()) { - - void *base_src = op.source().get_base_ptr(); - void *base_dest = op.dest().get_base_ptr(); - - assert(base_src); - assert(base_dest); - - range<3> transferred_range = op.get_num_transferred_elements(); - range<3> src_allocation_shape = op.source().get_allocation_shape(); - range<3> dest_allocation_shape = op.dest().get_allocation_shape(); - id<3> src_offset = op.source().get_access_offset(); - id<3> dest_offset = op.dest().get_access_offset(); - std::size_t src_element_size = op.source().get_element_size(); - std::size_t dest_element_size = op.dest().get_element_size(); - - std::size_t total_num_bytes = op.get_num_transferred_bytes(); - - bool is_src_contiguous = - is_contigous(src_offset, transferred_range, src_allocation_shape); - bool is_dest_contiguous = - is_contigous(dest_offset, transferred_range, dest_allocation_shape); - - omp_instrumentation_setup instrumentation_setup{op, node}; - - _worker([=]() { - auto instrumentation_guard = instrumentation_setup.instrument_task(); - - auto linear_index = [](id<3> id, range<3> allocation_shape) { - return id[2] + allocation_shape[2] * id[1] + - allocation_shape[2] * allocation_shape[1] * id[0]; - }; - - if (is_src_contiguous && is_dest_contiguous) { - char *current_src = reinterpret_cast(base_src); - char *current_dest = reinterpret_cast(base_dest); - - current_src += - linear_index(src_offset, src_allocation_shape) * src_element_size; - current_dest += linear_index(dest_offset, dest_allocation_shape) * - dest_element_size; - - memcpy(current_dest, current_src, total_num_bytes); - } else { - id<3> current_src_offset = src_offset; - id<3> current_dest_offset = dest_offset; - std::size_t row_size = transferred_range[2] * src_element_size; - - for (std::size_t surface = 0; surface < transferred_range[0]; - ++surface) { - for (std::size_t row = 0; row < transferred_range[1]; ++row) { - - char *current_src = 
reinterpret_cast(base_src); - char *current_dest = reinterpret_cast(base_dest); - - current_src += - linear_index(current_src_offset, src_allocation_shape) * - src_element_size; - - current_dest += - linear_index(current_dest_offset, dest_allocation_shape) * - dest_element_size; - - assert(current_src + row_size <= - reinterpret_cast(base_src) + - src_allocation_shape.size() * src_element_size); - assert(current_dest + row_size <= - reinterpret_cast(base_dest) + - dest_allocation_shape.size() * dest_element_size); - - memcpy(current_dest, current_src, row_size); - - ++current_src_offset[1]; - ++current_dest_offset[1]; - } - current_src_offset[1] = src_offset[1]; - current_dest_offset[1] = dest_offset[1]; - - ++current_dest_offset[0]; - ++current_src_offset[0]; - } - } - }); - } else { + if (!op.source().get_device().is_host() || !op.dest().get_device().is_host()) { return register_error( __acpp_here(), error_info{"omp_queue: OpenMP CPU backend cannot transfer data between " @@ -358,6 +275,87 @@ result omp_queue::submit_memcpy(memcpy_operation &op, const dag_node_ptr& node) error_type::feature_not_supported}); } + void *base_src = op.source().get_base_ptr(); + void *base_dest = op.dest().get_base_ptr(); + + assert(base_src); + assert(base_dest); + + range<3> transferred_range = op.get_num_transferred_elements(); + range<3> src_allocation_shape = op.source().get_allocation_shape(); + range<3> dest_allocation_shape = op.dest().get_allocation_shape(); + id<3> src_offset = op.source().get_access_offset(); + id<3> dest_offset = op.dest().get_access_offset(); + std::size_t src_element_size = op.source().get_element_size(); + std::size_t dest_element_size = op.dest().get_element_size(); + + std::size_t total_num_bytes = op.get_num_transferred_bytes(); + + bool is_src_contiguous = + is_contigous(src_offset, transferred_range, src_allocation_shape); + bool is_dest_contiguous = + is_contigous(dest_offset, transferred_range, dest_allocation_shape); + + 
omp_instrumentation_setup instrumentation_setup{op, node}; + + _worker([=]() { + auto instrumentation_guard = instrumentation_setup.instrument_task(); + + auto linear_index = [](id<3> id, range<3> allocation_shape) { + return id[2] + allocation_shape[2] * id[1] + + allocation_shape[2] * allocation_shape[1] * id[0]; + }; + + if (is_src_contiguous && is_dest_contiguous) { + char *current_src = reinterpret_cast(base_src); + char *current_dest = reinterpret_cast(base_dest); + + current_src += + linear_index(src_offset, src_allocation_shape) * src_element_size; + current_dest += + linear_index(dest_offset, dest_allocation_shape) * dest_element_size; + + memcpy(current_dest, current_src, total_num_bytes); + } else { + id<3> current_src_offset = src_offset; + id<3> current_dest_offset = dest_offset; + std::size_t row_size = transferred_range[2] * src_element_size; + + for (std::size_t surface = 0; surface < transferred_range[0]; ++surface) { + for (std::size_t row = 0; row < transferred_range[1]; ++row) { + + char *current_src = reinterpret_cast(base_src); + char *current_dest = reinterpret_cast(base_dest); + + current_src += + linear_index(current_src_offset, src_allocation_shape) * + src_element_size; + + current_dest += + linear_index(current_dest_offset, dest_allocation_shape) * + dest_element_size; + + assert(current_src + row_size <= + reinterpret_cast(base_src) + + src_allocation_shape.size() * src_element_size); + assert(current_dest + row_size <= + reinterpret_cast(base_dest) + + dest_allocation_shape.size() * dest_element_size); + + memcpy(current_dest, current_src, row_size); + + ++current_src_offset[1]; + ++current_dest_offset[1]; + } + current_src_offset[1] = src_offset[1]; + current_dest_offset[1] = dest_offset[1]; + + ++current_dest_offset[0]; + ++current_src_offset[0]; + } + } + }); + return make_success(); } @@ -369,7 +367,7 @@ result omp_queue::submit_kernel(kernel_operation &op, const dag_node_ptr& node) const kernel_configuration *config = 
&(op.get_launcher().get_kernel_configuration()); - + auto backend_id = _backend_id; void* params = this; rt::dag_node* node_ptr = node.get(); @@ -417,7 +415,7 @@ result omp_queue::submit_sscp_kernel_from_code_object( group_size, args, arg_sizes, num_args, local_mem_size}; _config = initial_config; - + _config.append_base_configuration( kernel_base_config_parameter::backend_id, backend_id::omp); _config.append_base_configuration( From e5dd56e9317b51e495d1ccfd9568c479e8d6560f Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 28 Nov 2024 16:39:19 +0100 Subject: [PATCH 051/126] Rewrite vector add example using device USM (#1619) --- doc/examples.md | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/doc/examples.md b/doc/examples.md index 79b64fe88..93d14b69c 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -3,41 +3,42 @@ The following code adds two vectors: #include #include -#include +#include using data_type = float; -std::vector add(cl::sycl::queue& q, +std::vector add(sycl::queue& q, const std::vector& a, - const std::vector& b) -{ + const std::vector& b) { std::vector c(a.size()); assert(a.size() == b.size()); - cl::sycl::range<1> work_items{a.size()}; - - { - cl::sycl::buffer buff_a(a.data(), a.size()); - cl::sycl::buffer buff_b(b.data(), b.size()); - cl::sycl::buffer buff_c(c.data(), c.size()); - - q.submit([&](cl::sycl::handler& cgh){ - auto access_a = buff_a.get_access(cgh); - auto access_b = buff_b.get_access(cgh); - auto access_c = buff_c.get_access(cgh); - - cgh.parallel_for(work_items, - [=] (cl::sycl::id<1> tid) { - access_c[tid] = access_a[tid] + access_b[tid]; - }); - }); - } + + data_type* dev_a = sycl::malloc_device(a.size(), q); + data_type* dev_b = sycl::malloc_device(a.size(), q); + data_type* dev_c = sycl::malloc_device(a.size(), q); + + q.memcpy(dev_a, a.data(), sizeof(T) * a.size()); + q.memcpy(dev_b, b.data(), sizeof(T) * b.size()); + q.memcpy(dev_c, c.data(), sizeof(T) 
* c.size()); + + q.parallel_for(a.size(), [=](sycl::id<1> idx){ + dev_c[idx] = dev_a[idx] + dev_b[idx]; + }); + + q.memcpy(c.data(), dev_c, sizeof(T) * c.size()); + q.wait(); + + sycl::free(dev_a, q); + sycl::free(dev_b, q); + sycl::free(dev_c, q); + return c; } int main() { - cl::sycl::queue q; + sycl::queue q{sycl::property::queue::in_order{}}; std::vector a = {1.f, 2.f, 3.f, 4.f, 5.f}; std::vector b = {-1.f, 2.f, -3.f, 4.f, -5.f}; auto result = add(q, a, b); @@ -47,4 +48,4 @@ int main() std::cout << x << std::endl; } -``` \ No newline at end of file +``` From 1c4c391dc8ee7d8cb6ed779030d6fd044f33bd3a Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 28 Nov 2024 16:39:54 +0100 Subject: [PATCH 052/126] Also refer to performance guide in acpp --acpp-help (#1617) --- bin/acpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/acpp b/bin/acpp index c9d24077a..f806c7001 100755 --- a/bin/acpp +++ b/bin/acpp @@ -1942,6 +1942,8 @@ def print_usage(config): print("--help\n Print this help message\n") print("\nAny other options will be forwarded to the compiler.") print("\nNote: Command line arguments take precedence over environment variables.") + print("\n\nFor guidance on how to get good performance with AdaptiveCpp, please see") + print("\nhttps://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/performance.md") if __name__ == '__main__': if sys.version_info[0] < 3: From 0df25f8e352ca31a234f91a692586de0dced9d22 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 28 Nov 2024 16:40:14 +0100 Subject: [PATCH 053/126] Warn if buffers are used (#1618) --- include/hipSYCL/sycl/buffer.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/hipSYCL/sycl/buffer.hpp b/include/hipSYCL/sycl/buffer.hpp index 0fa83271d..51c8679db 100644 --- a/include/hipSYCL/sycl/buffer.hpp +++ b/include/hipSYCL/sycl/buffer.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include "hipSYCL/common/debug.hpp" #include "hipSYCL/runtime/allocator.hpp" @@ 
-190,6 +191,21 @@ struct buffer_impl bool destructor_waits; bool use_external_storage; + buffer_impl() { + static std::atomic was_warning_emitted = false; + if(!was_warning_emitted) { + HIPSYCL_DEBUG_WARNING << "This application uses SYCL buffers; the SYCL " + "buffer-accessor model is well-known to introduce unnecessary " + "overheads. Please consider migrating to the SYCL2020 USM model, " + "in particular device USM (sycl::malloc_device) combined with " + "in-order queues for more performance. See the AdaptiveCpp " + "performance guide for more information: \n" + "https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/performance.md" + << std::endl; + was_warning_emitted = true; + } + } + ~buffer_impl() { if (writes_back) { if (!writeback_ptr) { From 5aa268a7555729843f680664248794e2e3177356 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 28 Nov 2024 16:47:06 +0100 Subject: [PATCH 054/126] [SSCP][Adaptivity] Add support for AdaptiveCpp_restrict_ptr and automatic inference of noalias semantics (#1593) * Add support for AdaptiveCpp_restrict_ptr extension * Migrate core stdpar allocation map data structure to common * Add RSD optimization at higher adaptivity levels * Rename restrict enums to noalias to avoid compilation issues on nvc++ where restrict is keyword * Add ACPP_ENABLE_ALLOCATION_TRACKING env variable --- doc/env_variables.md | 3 +- doc/extensions.md | 20 + doc/performance.md | 6 + .../algorithms/util/allocation_cache.hpp | 12 +- include/hipSYCL/common/allocation_map.hpp | 496 ++++++++++++++++++ .../llvm-to-backend/LLVMToBackend.hpp | 3 +- include/hipSYCL/glue/llvm-sscp/jit.hpp | 6 + .../hipSYCL/runtime/allocation_tracker.hpp | 33 ++ include/hipSYCL/runtime/allocator.hpp | 30 +- include/hipSYCL/runtime/application.hpp | 3 + .../hipSYCL/runtime/cuda/cuda_allocator.hpp | 12 +- include/hipSYCL/runtime/data.hpp | 2 +- include/hipSYCL/runtime/hip/hip_allocator.hpp | 12 +- include/hipSYCL/runtime/kernel_cache.hpp | 3 +- 
.../hipSYCL/runtime/kernel_configuration.hpp | 34 ++ include/hipSYCL/runtime/ocl/ocl_allocator.hpp | 12 +- include/hipSYCL/runtime/omp/omp_allocator.hpp | 12 +- .../runtime/runtime_event_handlers.hpp | 48 ++ include/hipSYCL/runtime/settings.hpp | 9 +- include/hipSYCL/runtime/ze/ze_allocator.hpp | 17 +- .../std/stdpar/detail/allocation_map.hpp | 477 +---------------- .../hipSYCL/std/stdpar/detail/sycl_glue.hpp | 31 +- include/hipSYCL/sycl/buffer.hpp | 21 +- include/hipSYCL/sycl/extensions.hpp | 1 + include/hipSYCL/sycl/libkernel/restrict.hpp | 69 +++ include/hipSYCL/sycl/usm.hpp | 17 +- .../llvm-to-backend/LLVMToBackend.cpp | 18 + src/runtime/CMakeLists.txt | 3 + src/runtime/adaptivity_engine.cpp | 71 +++ src/runtime/allocation_tracker.cpp | 46 ++ src/runtime/allocator.cpp | 61 +++ src/runtime/application.cpp | 5 + src/runtime/cuda/cuda_allocator.cpp | 14 +- src/runtime/dag_direct_scheduler.cpp | 2 +- src/runtime/hip/hip_allocator.cpp | 14 +- src/runtime/kernel_cache.cpp | 2 + src/runtime/ocl/ocl_allocator.cpp | 16 +- src/runtime/ocl/ocl_hardware_manager.cpp | 5 +- src/runtime/omp/omp_allocator.cpp | 16 +- src/runtime/runtime_event_handlers.cpp | 42 ++ src/runtime/ze/ze_allocator.cpp | 24 +- src/runtime/ze/ze_backend.cpp | 2 +- tests/pstl/allocation_map.cpp | 3 +- tests/pstl/free_space_map.cpp | 4 +- 44 files changed, 1181 insertions(+), 556 deletions(-) create mode 100644 include/hipSYCL/common/allocation_map.hpp create mode 100644 include/hipSYCL/runtime/allocation_tracker.hpp create mode 100644 include/hipSYCL/runtime/runtime_event_handlers.hpp create mode 100644 include/hipSYCL/sycl/libkernel/restrict.hpp create mode 100644 src/runtime/allocation_tracker.cpp create mode 100644 src/runtime/allocator.cpp create mode 100644 src/runtime/runtime_event_handlers.cpp diff --git a/doc/env_variables.md b/doc/env_variables.md index 714cc2e8c..62e17a4d3 100644 --- a/doc/env_variables.md +++ b/doc/env_variables.md @@ -31,4 +31,5 @@ * `ACPP_APPDB_DIR`: By default, AdaptiveCpp 
stores its application db (which in particular includes the per-app JIT cache) in `$HOME/.acpp`. This environment variable can be used to override the location. * `ACPP_JITOPT_IADS_RELATIVE_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): When the same argument has been passed into the kernel for this fraction of all invocations of the kernel, a new kernel will be JIT-compiled with the argument value hard-wired as constant. Not taken into account for the first application run. Default: 0.8. * `ACPP_JITOPT_IADS_RELATIVE_THRESHOLD_MIN_DATA`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): Only consider kernels with at least many invocations for the relative threshold described above. Default: 1024. -* `ACPP_JITOPT_IADS_RELATIVE_EVICTION_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): If the relative frequency of a kernel argument value falls below this threshold, the statistics entry for the the argument value may be evicted if space for other values is needed. \ No newline at end of file +* `ACPP_JITOPT_IADS_RELATIVE_EVICTION_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): If the relative frequency of a kernel argument value falls below this threshold, the statistics entry for the the argument value may be evicted if space for other values is needed. +* `ACPP_ENABLE_ALLOCATION_TRACKING`: If set to 1, allows the AdaptiveCpp runtime to track and register the allocations that it manages. This enables additional JIT-time optimizations. Set to 0 to disable. 
(Default: 0) diff --git a/doc/extensions.md b/doc/extensions.md index 3b8741f3f..726d952b8 100644 --- a/doc/extensions.md +++ b/doc/extensions.md @@ -4,6 +4,26 @@ AdaptiveCpp implements several extensions that are not defined by the specificat ## Supported extensions +### `ACPP_EXT_RESTRICT_PTR` + +Provides a wrapper type that hints to the compiler that a pointer kernel argument does not alias other pointer arguments. +*Note:* This currently only has an effect with AdaptiveCpp's generic JIT compiler (`--acpp-targets=generic`), other compilation flows ignore this hint. + +Example: + +```c++ + +sycl::queue q; +float* data = ... +sycl::AdaptiveCpp_restrict_ptr restrict_data = data; + +q.parallel_for(range, [=](auto idx){ + // Converts implicitly to the underlying pointer type - float* in this + // example. + restrict_data[idx] *= 1.5f; +}); +``` + ### `ACPP_EXT_JIT_COMPILE_IF` Allows for specializing code based on target properties only known at JIT time. This is only supported with AdaptiveCpp's default generic JIT compiler (`--acpp-targets=generic`). diff --git a/doc/performance.md b/doc/performance.md index 5121a5040..03b613c1c 100644 --- a/doc/performance.md +++ b/doc/performance.md @@ -89,6 +89,8 @@ This optimization process is complete when the following warning is no longer pr The extent of this can be controlled using the environment variable `ACPP_ADAPTIVITY_LEVEL`. A value of 0 disables the feature. The default is 1. Higher levels are expected to result in higher peak performance, but may require more application runs to converge to this performance. The default level of 1 usually guarantees peak performance for the second application run. +Setting `ACPP_ENABLE_ALLOCATION_TRACKING=1` enables additional optimizations at adaptivity level 1. + At adaptivity level >= 2, AdaptiveCpp will enable additional, aggressive optimizations. In particular, AdaptiveCpp will attempt to detect invariant kernel arguments, and hardwire those as constants during JIT time. 
In some cases, this can result in substantial performance increases. It is thus advisable to try setting `ACPP_ADAPTIVITY_LEVEL=2` and running the application a couple of times (typically 3-4 times). @@ -96,6 +98,10 @@ Note: Applications that are highly latency-sensitive may notice a slightly incre **For peak performance, you should not disable adaptivity, and run the application until the warning above is no longer printed.** +We recommend: +* Experiment with `ACPP_ADAPTIVITY_LEVEL=1` and `ACPP_ADAPTIVITY_LEVEL=2` +* Experiment with `ACPP_ENABLE_ALLOCATION_TRACKING=1` and `ACPP_ENABLE_ALLOCATION_TRACKING=0`. + *Note: Adaptivity levels higher than 2 are currently not implemented.* ### Empty the kernel cache when upgrading the stack diff --git a/include/hipSYCL/algorithms/util/allocation_cache.hpp b/include/hipSYCL/algorithms/util/allocation_cache.hpp index 257db32bd..b92ed32c7 100644 --- a/include/hipSYCL/algorithms/util/allocation_cache.hpp +++ b/include/hipSYCL/algorithms/util/allocation_cache.hpp @@ -53,10 +53,10 @@ class allocation_cache { std::lock_guard lock{_mutex}; for(auto& allocation : _allocations) { - _rt.get()->backends() + auto* allocator = _rt.get()->backends() .get(allocation.dev.get_backend()) - ->get_allocator(allocation.dev) - ->free(allocation.ptr); + ->get_allocator(allocation.dev); + rt::deallocate(allocator, allocation.ptr); } _allocations.clear(); } @@ -74,12 +74,12 @@ class allocation_cache { ->get_allocator(dev); if(_alloc_type == allocation_type::device) - result.ptr = allocator->allocate(min_alignment, min_size); + result.ptr = rt::allocate_device(allocator, min_alignment, min_size); else if(_alloc_type == allocation_type::shared) - result.ptr = allocator->allocate_usm(min_size); + result.ptr = rt::allocate_shared(allocator, min_size); else result.ptr = - allocator->allocate_optimized_host(min_alignment, min_size); + rt::allocate_host(allocator, min_alignment, min_size); } return result; } diff --git 
a/include/hipSYCL/common/allocation_map.hpp b/include/hipSYCL/common/allocation_map.hpp new file mode 100644 index 000000000..90360428e --- /dev/null +++ b/include/hipSYCL/common/allocation_map.hpp @@ -0,0 +1,496 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_ALLOCATION_MAP_HPP +#define ACPP_ALLOCATION_MAP_HPP + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace hipsycl::common { + +struct stdlib_untyped_allocator { + static void* allocate(size_t n) { + return std::malloc(n); + } + + static void deallocate(void* ptr) { + std::free(ptr); + } +}; + +template +class bit_tree { +protected: + bit_tree(){} + + static constexpr int num_levels = sizeof...(Bit_sizes); + static constexpr int root_level_idx = num_levels - 1; + static constexpr int bitsizes[num_levels] = {Bit_sizes...}; + + static constexpr int get_num_entries_in_level(int level) { + return 1ull << bitsizes[level]; + } + + static constexpr int get_bitoffset_in_level(int level) { + int result = 0; + for(int i = 0; i < level; ++i) { + result += bitsizes[i]; + } + return result; + } + + static constexpr int get_index_in_level(Int_type address, int level) { + Int_type bitmask = get_n_low_bits_set(bitsizes[level]); + return (address >> get_bitoffset_in_level(level)) & bitmask; + } + + static constexpr uint64_t get_n_low_bits_set(int n) { + if(n == 64) + return ~0ull; + return (1ull << n) - 1; + } + + static constexpr uint64_t get_space_spanned_by_node_in_level(int level) { + uint64_t result = 1; + for(int i = 0; i < level; ++i) + result *= get_num_entries_in_level(level); + return result; + } + + template + static T* alloc(int count) { + 
return static_cast(UntypedAllocatorT::allocate(sizeof(T) * count)); + } + + static void free(void* ptr) { + UntypedAllocatorT::deallocate(ptr); + } +}; + +template +using allocation_map_bit_tree_config = bit_tree; + +template +class allocation_map : public allocation_map_bit_tree_config { +public: + using bit_tree_t = allocation_map_bit_tree_config; + + static_assert(sizeof(void*) == 8, "Unsupported pointer size"); + static_assert(std::is_trivial_v, "UserPayload must be trivial type"); + + allocation_map() + : _num_in_progress_operations{0} {} + + struct value_type : public UserPayload { + std::size_t allocation_size; + }; + + // Access entry of allocation that address belongs to, or nullptr if the address + // does not belong to a known allocation. + value_type* get_entry(uint64_t address, uint64_t& root_address) noexcept { + insert_or_get_entry_lock lock{_num_in_progress_operations}; + root_address = 0; + int num_leaf_attempts = 0; + return get_entry(_root, address, num_leaf_attempts, root_address); + } + + // Access entry of allocation that has the given address. Unlike get_entry(), + // this does not succeed if the address does not point to the base of the allocation. + value_type* get_entry_of_root_address(uint64_t address) noexcept { + insert_or_get_entry_lock lock{_num_in_progress_operations}; + return get_entry_of_root_address(_root, address); + } + + // Insert new element. Element's allocation range must be + // non-overlapping w.r.t existing entries. + // ~0ull is unsupported, because then non-zero allocation + // ranges cannot be expressed. 
+ bool insert(uint64_t address, const value_type& v) { + insert_or_get_entry_lock lock{_num_in_progress_operations}; + return insert(_root, address, v); + } + + bool erase(uint64_t address) { + erase_lock lock{_num_in_progress_operations}; + return erase(_root, address); + } + + ~allocation_map() { + for (int i = 0; + i < this->get_num_entries_in_level(bit_tree_t::root_level_idx); ++i) { + auto* ptr = _root.children[i].load(std::memory_order_acquire); + if(ptr) + release(*ptr); + } + } + +private: + // Useful for debugging/printing + template + void with_decomposed_address(uint64_t address, int current_level, F&& handler) { + for(int i = this->root_level_idx; i >= current_level; --i) { + handler(this->get_index_in_level(address, i)); + } + for(int i = current_level - 1; i >= 0; --i) { + handler(-1); + } + } + + template + void print(Ostream& ostr, uint64_t address, int level) { + with_decomposed_address(address, level, [&](int x){ + if(x >= 0) + ostr << x << "."; + else + ostr << "x"; + }); + ostr << "\n"; + } + + struct leaf_node { + leaf_node() + : num_entries {} { + for(int i = 0; i < bit_tree_t::get_num_entries_in_level(0); ++i) { + entries[i].allocation_size = 0; + } + } + + value_type entries [bit_tree_t::get_num_entries_in_level(0)]; + std::atomic num_entries; + }; + + template + struct intermediate_node { + private: + static constexpr auto make_child() { + if constexpr (Level > 1) return + intermediate_node{}; + else return leaf_node{}; + } + public: + intermediate_node() + : children{}, num_entries{} {} + + using child_type = decltype(make_child()); + + std::atomic children [bit_tree_t::get_num_entries_in_level(Level)]; + std::atomic num_entries; + }; + + value_type *get_entry(leaf_node ¤t_node, uint64_t address, + int &/*num_leaf_attempts*/, + uint64_t &root_address) noexcept { + int start_address = 0; + + uint64_t max_local_address = + root_address | (bit_tree_t::get_num_entries_in_level(0) - 1); + + if(max_local_address <= address) + start_address = 
bit_tree_t::get_num_entries_in_level(0) - 1; + else + start_address = bit_tree_t::get_index_in_level(address, 0); + + for (int local_address = start_address; local_address >= 0; + --local_address) { + + auto& element = current_node.entries[local_address]; + + std::size_t allocation_size = + __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); + if(allocation_size > 0) { + + uint64_t root_address_candidate = + root_address | (static_cast(local_address) + << bit_tree_t::get_bitoffset_in_level(0)); + + uint64_t allocation_end = root_address_candidate + allocation_size; + if(address >= root_address_candidate && address < allocation_end) { + root_address = root_address_candidate; + return &element; + } else { + return nullptr; + } + + } + } + return nullptr; + } + + template + value_type *get_entry(intermediate_node ¤t_node, + uint64_t address, + int& num_leaf_attempts, + uint64_t& root_address) noexcept { + // If the queried address is too close to the next allocation, + // it can happen that the search converges on the next allocation. + // Therefore, to exclude that case, if a search fails, we also + // need to try again with the next allocation before that. + // This variable counts how many leaves we have accessed. If it + // reaches two, we can abort. + if constexpr(Level == bit_tree_t::root_level_idx) { + num_leaf_attempts = 0; + } + + uint64_t max_local_address = + root_address | + this->get_n_low_bits_set(bit_tree_t::get_bitoffset_in_level(Level) + + bit_tree_t::bitsizes[Level]); + + // We are always looking for the next allocation preceding the + // current address. If the maximum local address in this node + // cannot reach the search address, (e.g. if we are looking in + // a preceding node at the same level), we need to start from + // the maximum address. Otherwise, we need to look at the bits + // set in this address. 
+ int start_address = 0; + if(max_local_address <= address) + start_address = bit_tree_t::get_num_entries_in_level(Level) - 1; + else + start_address = bit_tree_t::get_index_in_level(address, Level); + + for (int local_address = start_address; + local_address >= 0; --local_address) { + + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + + if(ptr) { + uint64_t root_address_candidate = + root_address | (static_cast(local_address) + << bit_tree_t::get_bitoffset_in_level(Level)); + + auto* ret = get_entry(*ptr, address, num_leaf_attempts, + root_address_candidate); + // If we are in level 1, ret refers to a leaf node + if constexpr(Level == 1) { + ++num_leaf_attempts; + } + + if(ret) { + root_address = root_address_candidate; + return ret; + } else if(num_leaf_attempts >= 2) { + // We can abort if we have looked at the first hit leaf node, + // and the one before that. + return nullptr; + } + } + } + return nullptr; + } + + value_type *get_entry_of_root_address(leaf_node ¤t_node, uint64_t address) noexcept { + int local_address = bit_tree_t::get_index_in_level(address, 0); + + auto& element = current_node.entries[local_address]; + std::size_t allocation_size = + __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); + + if (allocation_size > 0) { + return &element; + } + + return nullptr; + } + + template + value_type *get_entry_of_root_address(intermediate_node ¤t_node, + uint64_t address) noexcept { + int local_address = bit_tree_t::get_index_in_level(address, Level); + + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + + if(ptr) { + return get_entry_of_root_address(*ptr, address); + } + return nullptr; + } + + bool insert(leaf_node ¤t_node, uint64_t address, const value_type &v) { + + int local_address = bit_tree_t::get_index_in_level(address, 0); + + std::size_t *allocation_size_ptr = + &(current_node.entries[local_address].allocation_size); + + std::size_t allocation_size = 
__atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE); + if(allocation_size > 0) { + // Entry is already occupied + return false; + } + + __atomic_store_n(allocation_size_ptr, v.allocation_size, __ATOMIC_RELEASE); + current_node.entries[local_address].UserPayload::operator=(v); + + current_node.num_entries.fetch_add( + 1, std::memory_order_acq_rel); + + return true; + } + + template + bool insert(intermediate_node ¤t_node, uint64_t address, + const value_type &v) { + using child_t = typename intermediate_node::child_type; + + int local_address = bit_tree_t::get_index_in_level(address, Level); + + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + + if(!ptr) { + child_t* new_child = this->template alloc(1); + new (new_child) child_t{}; + + if (!current_node.children[local_address].compare_exchange_strong( + ptr /* == nullptr*/, new_child, std::memory_order_acq_rel)) { + // Assigning new child has failed because child is no longer nullptr + // -> free new child again + destroy(*new_child); + this->free(new_child); + } else { + current_node.num_entries.fetch_add( + 1, std::memory_order_acq_rel); + ptr = new_child; + } + } + + return insert(*ptr, address, v); + } + + bool erase(leaf_node& current_node, uint64_t address) { + int local_address = bit_tree_t::get_index_in_level(address, 0); + + std::size_t *allocation_size_ptr = + &(current_node.entries[local_address].allocation_size); + // Entry was already deleted or does not exist + if(__atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE) == 0) + return false; + + __atomic_store_n(allocation_size_ptr, 0, __ATOMIC_RELEASE); + + current_node.num_entries.fetch_sub( + 1, std::memory_order_acq_rel); + + return true; + } + + template + bool erase(intermediate_node ¤t_node, uint64_t address) { + + int local_address = bit_tree_t::get_index_in_level(address, Level); + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + if(!ptr) + return false; + + bool 
result = erase(*ptr, address); + if(result) { + if(ptr->num_entries.load(std::memory_order_acquire) == 0) { + auto *current_ptr = current_node.children[local_address].exchange( + nullptr, std::memory_order_acq_rel); + // TODO: We could potentially get erase() lock-free + // by counting by how many ops each node is currently used, + // and waiting here until the count turns to 0. + if(current_ptr) { + destroy(*current_ptr); + this->free(current_ptr); + current_node.num_entries.fetch_sub( + 1, std::memory_order_acq_rel); + } + } + } + return result; + } + + void release(leaf_node& current_node) { + destroy(current_node); + } + + template + void release(intermediate_node& current_node) { + for(int i = 0; i < bit_tree_t::get_num_entries_in_level(Level); ++i){ + if (auto *ptr = current_node.children[i].load( + std::memory_order_acquire)) { + release(*ptr); + this->free(ptr); + } + } + destroy(current_node); + } + + void destroy(leaf_node& node) { + node.~leaf_node(); + } + + template + void destroy(intermediate_node& node) { + node.~intermediate_node(); + } + + struct erase_lock { + public: + erase_lock(std::atomic& op_counter) + : _op_counter{op_counter} { + int expected = 0; + while (!_op_counter.compare_exchange_strong( + expected, -1, std::memory_order_release, std::memory_order_relaxed)) { + expected = 0; + } + } + + ~erase_lock() { + _op_counter.store(0, std::memory_order_release); + } + private: + std::atomic& _op_counter; + }; + + struct insert_or_get_entry_lock { + public: + insert_or_get_entry_lock(std::atomic& op_counter) + : _op_counter{op_counter} { + int expected = std::max(0, _op_counter.load(std::memory_order_acquire)); + while (!_op_counter.compare_exchange_strong( + expected, expected+1, std::memory_order_release, + std::memory_order_relaxed)) { + if(expected < 0) + expected = 0; + } + } + + ~insert_or_get_entry_lock() { + _op_counter.fetch_sub(1, std::memory_order_acq_rel); + } + private: + std::atomic& _op_counter; + }; + + intermediate_node _root; + 
std::atomic _num_in_progress_operations; +}; + + +} + +#endif diff --git a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp index 070dc9adc..977fa975c 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp @@ -50,6 +50,7 @@ class LLVMToBackendTranslator { virtual ~LLVMToBackendTranslator() {} + void setNoAliasKernelParam(const std::string& KernelName, int ParamIndex); void specializeKernelArgument(const std::string &KernelName, int ParamIndex, const void *ValueBuffer); void specializeFunctionCalls(const std::string &FuncName, @@ -77,7 +78,6 @@ class LLVMToBackendTranslator { bool prepareIR(llvm::Module& M); bool translatePreparedIR(llvm::Module& FlavoredModule, std::string& out); - const std::vector& getErrorLog() const { return Errors; } @@ -234,6 +234,7 @@ class LLVMToBackendTranslator { std::string ErroringCode; std::vector*>> FunctionsForDeadArgumentElimination; + std::unordered_map> NoAliasParameters; // map from kernel name to list of (param index, alignment) std::unordered_map>> KnownPtrParamAlignments; diff --git a/include/hipSYCL/glue/llvm-sscp/jit.hpp b/include/hipSYCL/glue/llvm-sscp/jit.hpp index 0e567b47e..2f97dd833 100644 --- a/include/hipSYCL/glue/llvm-sscp/jit.hpp +++ b/include/hipSYCL/glue/llvm-sscp/jit.hpp @@ -246,6 +246,12 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, entry.first, &entry.second); } + int num_param_indices = static_cast(config.get_num_kernel_param_indices()); + for (int i = 0; i < num_param_indices; ++i) { + if (config.has_kernel_param_flag(i, rt::kernel_param_flag::noalias)) { + translator->setNoAliasKernelParam(translator->getKernels().front(), i); + } + } for(const auto& entry : config.known_alignments()) { translator->setKnownPtrParamAlignment(translator->getKernels().front(), entry.first, entry.second); diff --git 
a/include/hipSYCL/runtime/allocation_tracker.hpp b/include/hipSYCL/runtime/allocation_tracker.hpp new file mode 100644 index 000000000..48a830158 --- /dev/null +++ b/include/hipSYCL/runtime/allocation_tracker.hpp @@ -0,0 +1,33 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_ALLOCATION_TRACKER_HPP +#define ACPP_ALLOCATION_TRACKER_HPP + +#include +#include "runtime_event_handlers.hpp" +#include "hipSYCL/common/allocation_map.hpp" + +namespace hipsycl { +namespace rt { + +class allocation_tracker { +public: + static bool query_allocation(const void *ptr, allocation_info &out, + uint64_t &root_address); + static bool register_allocation(const void *ptr, std::size_t size, + const allocation_info &info); + static bool unregister_allocation(const void* ptr); +}; + +} +} + +#endif \ No newline at end of file diff --git a/include/hipSYCL/runtime/allocator.hpp b/include/hipSYCL/runtime/allocator.hpp index 240d07d2a..79375e0b7 100644 --- a/include/hipSYCL/runtime/allocator.hpp +++ b/include/hipSYCL/runtime/allocator.hpp @@ -33,14 +33,24 @@ struct pointer_info { class backend_allocator { public: - virtual void *allocate(size_t min_alignment, size_t size_bytes) = 0; - // Optimized host memory - may be page-locked, device mapped if supported - virtual void* allocate_optimized_host(size_t min_alignment, size_t bytes) = 0; - virtual void free(void *mem) = 0; + + /// Raw allocation mechanism that does not interact with the runtime's + /// event handler mechanism. Should not be called directly in most cases. 
+ virtual void *raw_allocate(size_t min_alignment, size_t size_bytes) = 0; + /// Optimized host memory - may be page-locked, device mapped if supported + /// Raw mechanism that does not interact with the runtime's + /// event handler mechanism. Should not be called directly in most cases. + virtual void* raw_allocate_optimized_host(size_t min_alignment, size_t bytes) = 0; + /// Raw free mechanism that does not interact with the runtime's + /// event handler mechanism. Should not be called directly in most cases. + virtual void raw_free(void *mem) = 0; + /// Allocate memory accessible both from the host and the backend. + /// Raw mechanism that does not interact with the runtime's + /// event handler mechanism. Should not be called directly in most cases. + virtual void *raw_allocate_usm(size_t bytes) = 0; + virtual device_id get_device() const = 0; - /// Allocate memory accessible both from the host and the backend - virtual void *allocate_usm(size_t bytes) = 0; virtual bool is_usm_accessible_from(backend_descriptor b) const = 0; // Query the given pointer for its properties. 
If pointer is unknown, @@ -53,6 +63,14 @@ class backend_allocator virtual ~backend_allocator(){} }; +void *allocate_device(backend_allocator *alloc, size_t min_alignment, + size_t size_bytes); +void *allocate_host(backend_allocator *alloc, size_t min_alignment, + size_t bytes); +void *allocate_shared(backend_allocator* alloc, size_t bytes); +void deallocate(backend_allocator* alloc, void *mem); + + } } diff --git a/include/hipSYCL/runtime/application.hpp b/include/hipSYCL/runtime/application.hpp index 4cb86bcc9..7f426ca69 100644 --- a/include/hipSYCL/runtime/application.hpp +++ b/include/hipSYCL/runtime/application.hpp @@ -16,6 +16,7 @@ #include "backend.hpp" #include "device_id.hpp" #include "settings.hpp" +#include "runtime_event_handlers.hpp" namespace hipsycl { namespace rt { @@ -24,6 +25,7 @@ class dag_manager; class runtime; class async_error_list; + class application { public: @@ -32,6 +34,7 @@ class application // from the runtime or kernel launchers. static std::shared_ptr get_runtime_pointer(); static async_error_list& errors(); + static runtime_event_handlers& event_handler_layer(); application() = delete; }; diff --git a/include/hipSYCL/runtime/cuda/cuda_allocator.hpp b/include/hipSYCL/runtime/cuda/cuda_allocator.hpp index 50e514237..0497f865c 100644 --- a/include/hipSYCL/runtime/cuda/cuda_allocator.hpp +++ b/include/hipSYCL/runtime/cuda/cuda_allocator.hpp @@ -21,20 +21,22 @@ class cuda_allocator : public backend_allocator public: cuda_allocator(backend_descriptor desc, int cuda_device); - virtual void* allocate(size_t min_alignment, size_t size_bytes) override; + virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override; - virtual void *allocate_optimized_host(size_t min_alignment, - size_t bytes) override; + virtual void *raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) override; - virtual void free(void *mem) override; + virtual void raw_free(void *mem) override; - virtual void *allocate_usm(size_t bytes) override; 
+ virtual void *raw_allocate_usm(size_t bytes) override; virtual bool is_usm_accessible_from(backend_descriptor b) const override; virtual result query_pointer(const void* ptr, pointer_info& out) const override; virtual result mem_advise(const void *addr, std::size_t num_bytes, int advise) const override; + + virtual device_id get_device() const override; private: backend_descriptor _backend_descriptor; int _dev; diff --git a/include/hipSYCL/runtime/data.hpp b/include/hipSYCL/runtime/data.hpp index c80bca32e..6879e5016 100644 --- a/include/hipSYCL/runtime/data.hpp +++ b/include/hipSYCL/runtime/data.hpp @@ -417,7 +417,7 @@ class data_region "a memory leak." << std::endl; } else { - alloc.managing_allocator->free(alloc.memory); + rt::deallocate(alloc.managing_allocator, alloc.memory); } } return true; diff --git a/include/hipSYCL/runtime/hip/hip_allocator.hpp b/include/hipSYCL/runtime/hip/hip_allocator.hpp index 7956e80f4..89b8c4a7d 100644 --- a/include/hipSYCL/runtime/hip/hip_allocator.hpp +++ b/include/hipSYCL/runtime/hip/hip_allocator.hpp @@ -21,20 +21,22 @@ class hip_allocator : public backend_allocator public: hip_allocator(backend_descriptor desc, int hip_device); - virtual void* allocate(size_t min_alignment, size_t size_bytes) override; + virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override; - virtual void *allocate_optimized_host(size_t min_alignment, - size_t bytes) override; + virtual void *raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) override; - virtual void free(void *mem) override; + virtual void raw_free(void *mem) override; - virtual void *allocate_usm(size_t bytes) override; + virtual void *raw_allocate_usm(size_t bytes) override; virtual bool is_usm_accessible_from(backend_descriptor b) const override; virtual result query_pointer(const void* ptr, pointer_info& out) const override; virtual result mem_advise(const void *addr, std::size_t num_bytes, int advise) const override; + + virtual device_id 
get_device() const override; private: backend_descriptor _backend_descriptor; int _dev; diff --git a/include/hipSYCL/runtime/kernel_cache.hpp b/include/hipSYCL/runtime/kernel_cache.hpp index 3c6e7904a..4143ed16a 100644 --- a/include/hipSYCL/runtime/kernel_cache.hpp +++ b/include/hipSYCL/runtime/kernel_cache.hpp @@ -109,7 +109,8 @@ class hcf_kernel_info { enum annotation_type { specialized, - fcall_specialized_config + fcall_specialized_config, + noalias }; std::size_t get_argument_offset(std::size_t i) const; diff --git a/include/hipSYCL/runtime/kernel_configuration.hpp b/include/hipSYCL/runtime/kernel_configuration.hpp index de8ab2cdb..4bfc11932 100644 --- a/include/hipSYCL/runtime/kernel_configuration.hpp +++ b/include/hipSYCL/runtime/kernel_configuration.hpp @@ -69,6 +69,12 @@ enum class kernel_build_flag : int { spirv_enable_intel_llvm_spirv_options }; +enum class kernel_param_flag : int { + // these values are used as bit masks and should + // always have a value of a power of 2 + noalias = 1 +}; + std::string to_string(kernel_build_flag f); @@ -102,6 +108,12 @@ class kernel_configuration { std::make_pair(param_index, buffer_value)); } + void set_kernel_param_flag(int param_index, kernel_param_flag flag) { + if(_kernel_param_flags.size() <= param_index) + _kernel_param_flags.resize(param_index+1, 0); + _kernel_param_flags[param_index] |= static_cast(flag); + } + void set_function_call_specialization_config( int param_index, glue::sscp::fcall_config_kernel_property_t config) { _function_call_specializations.push_back(config); @@ -191,6 +203,15 @@ class kernel_configuration { &config_id, sizeof(config_id)); } + for(int i = 0; i < _kernel_param_flags.size(); ++i) { + if(_kernel_param_flags[i] != 0) { + auto flags = _kernel_param_flags[i]; + uint64_t numeric_option_id = static_cast(i) | (1ull << 36); + add_entry_to_hash(result, &numeric_option_id, sizeof(numeric_option_id), + &flags, sizeof(flags)); + } + } + for(const auto& entry : _known_alignments) { uint64_t 
numeric_option_id = static_cast(entry.first) | (1ull << 37); uint64_t config_id = entry.second; @@ -217,6 +238,18 @@ class kernel_configuration { return _function_call_specializations; } + bool has_kernel_param_flag(int param_index, kernel_param_flag flag) const { + if(param_index < _kernel_param_flags.size()) { + return _kernel_param_flags[param_index] & static_cast(flag); + } + + return false; + } + + std::size_t get_num_kernel_param_indices() const { + return _kernel_param_flags.size(); + } + const auto& known_alignments() const { return _known_alignments; } @@ -281,6 +314,7 @@ class kernel_configuration { std::vector> _specialized_kernel_args; std::vector _function_call_specializations; + std::vector _kernel_param_flags; std::vector> _known_alignments; id_type _base_configuration_result = {}; diff --git a/include/hipSYCL/runtime/ocl/ocl_allocator.hpp b/include/hipSYCL/runtime/ocl/ocl_allocator.hpp index 85ab58748..4c45e5daf 100644 --- a/include/hipSYCL/runtime/ocl/ocl_allocator.hpp +++ b/include/hipSYCL/runtime/ocl/ocl_allocator.hpp @@ -23,16 +23,16 @@ class ocl_allocator : public backend_allocator { public: ocl_allocator() = default; - ocl_allocator(ocl_usm* usm_provier); + ocl_allocator(rt::device_id dev, ocl_usm* usm_provier); - virtual void* allocate(size_t min_alignment, size_t size_bytes) override; + virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override; - virtual void *allocate_optimized_host(size_t min_alignment, + virtual void *raw_allocate_optimized_host(size_t min_alignment, size_t bytes) override; - virtual void free(void *mem) override; + virtual void raw_free(void *mem) override; - virtual void *allocate_usm(size_t bytes) override; + virtual void *raw_allocate_usm(size_t bytes) override; virtual bool is_usm_accessible_from(backend_descriptor b) const override; virtual result query_pointer(const void* ptr, pointer_info& out) const override; @@ -40,8 +40,10 @@ class ocl_allocator : public backend_allocator virtual result 
mem_advise(const void *addr, std::size_t num_bytes, int advise) const override; + virtual device_id get_device() const override; private: ocl_usm* _usm; + rt::device_id _dev; }; } diff --git a/include/hipSYCL/runtime/omp/omp_allocator.hpp b/include/hipSYCL/runtime/omp/omp_allocator.hpp index d74375ca5..8487a8c02 100644 --- a/include/hipSYCL/runtime/omp/omp_allocator.hpp +++ b/include/hipSYCL/runtime/omp/omp_allocator.hpp @@ -21,14 +21,14 @@ class omp_allocator : public backend_allocator public: omp_allocator(const device_id &my_device); - virtual void* allocate(size_t min_alignment, size_t size_bytes) override; + virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override; - virtual void *allocate_optimized_host(size_t min_alignment, + virtual void *raw_allocate_optimized_host(size_t min_alignment, size_t bytes) override; - virtual void free(void *mem) override; + virtual void raw_free(void *mem) override; - virtual void *allocate_usm(size_t bytes) override; + virtual void *raw_allocate_usm(size_t bytes) override; virtual bool is_usm_accessible_from(backend_descriptor b) const override; virtual result query_pointer(const void *ptr, @@ -36,10 +36,14 @@ class omp_allocator : public backend_allocator virtual result mem_advise(const void *addr, std::size_t num_bytes, int advise) const override; + + virtual device_id get_device() const override; private: device_id _my_device; }; + + } } diff --git a/include/hipSYCL/runtime/runtime_event_handlers.hpp b/include/hipSYCL/runtime/runtime_event_handlers.hpp new file mode 100644 index 000000000..9b841cbd1 --- /dev/null +++ b/include/hipSYCL/runtime/runtime_event_handlers.hpp @@ -0,0 +1,48 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_RT_EVENT_HANDLERS_HPP +#define ACPP_RT_EVENT_HANDLERS_HPP + +#include + +#include "backend.hpp" +#include "device_id.hpp" +#include "settings.hpp" + +namespace hipsycl { +namespace rt { + +struct allocation_info { + enum class allocation_type { + device, + shared, + host + }; + + rt::device_id dev; + allocation_type alloc_type; +}; + +class runtime_event_handlers { +public: + runtime_event_handlers(); + void on_new_allocation(const void*, std::size_t, const allocation_info& info); + void on_deallocation(const void* ptr); +private: + bool _needs_allocation_tracking; +}; + + +} +} + + +#endif diff --git a/include/hipSYCL/runtime/settings.hpp b/include/hipSYCL/runtime/settings.hpp index 83040837a..82e7a2ad9 100644 --- a/include/hipSYCL/runtime/settings.hpp +++ b/include/hipSYCL/runtime/settings.hpp @@ -107,7 +107,8 @@ enum class setting { adaptivity_level, jitopt_iads_relative_threshold, jitopt_iads_relative_eviction_threshold, - jitopt_iads_relative_threshold_min_data + jitopt_iads_relative_threshold_min_data, + enable_allocation_tracking }; template struct setting_trait {}; @@ -146,6 +147,7 @@ HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::jitopt_iads_relative_eviction_threshold, HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::jitopt_iads_relative_threshold_min_data, "jitopt_iads_relative_threshold_min_data", std::size_t) +HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::enable_allocation_tracking, "enable_allocation_tracking", bool) class settings { @@ -190,6 +192,8 @@ class settings return _jitopt_iads_relative_threshold_min_data; } else if constexpr(S == setting::jitopt_iads_relative_eviction_threshold) { return _jitopt_iads_relative_eviction_threshold; + } else if constexpr(S == setting::enable_allocation_tracking) { + return _enable_allocation_tracking; } return typename setting_trait::type{}; } @@ -242,6 +246,8 @@ class settings get_environment_variable_or_default(0.1); _jitopt_iads_relative_threshold_min_data = 
get_environment_variable_or_default(1024); + _enable_allocation_tracking = + get_environment_variable_or_default(false); } private: @@ -273,6 +279,7 @@ class settings double _jitopt_iads_relative_threshold; double _jitopt_iads_relative_eviction_threshold; std::size_t _jitopt_iads_relative_threshold_min_data; + bool _enable_allocation_tracking; }; } diff --git a/include/hipSYCL/runtime/ze/ze_allocator.hpp b/include/hipSYCL/runtime/ze/ze_allocator.hpp index eee8f2f69..adee72158 100644 --- a/include/hipSYCL/runtime/ze/ze_allocator.hpp +++ b/include/hipSYCL/runtime/ze/ze_allocator.hpp @@ -12,6 +12,7 @@ #define HIPSYCL_ZE_ALLOCATOR_HPP #include "../allocator.hpp" +#include "hipSYCL/runtime/device_id.hpp" #include "ze_hardware_manager.hpp" #include @@ -22,26 +23,30 @@ namespace rt { class ze_allocator : public backend_allocator { public: - ze_allocator(const ze_hardware_context* dev, const ze_hardware_manager* hw_manager); + ze_allocator(std::size_t device_index, const ze_hardware_context *dev, + const ze_hardware_manager *hw_manager); - virtual void* allocate(size_t min_alignment, size_t size_bytes) override; + virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override; - virtual void *allocate_optimized_host(size_t min_alignment, - size_t bytes) override; + virtual void *raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) override; - virtual void free(void *mem) override; + virtual void raw_free(void *mem) override; - virtual void *allocate_usm(size_t bytes) override; + virtual void *raw_allocate_usm(size_t bytes) override; virtual bool is_usm_accessible_from(backend_descriptor b) const override; virtual result query_pointer(const void* ptr, pointer_info& out) const override; virtual result mem_advise(const void *addr, std::size_t num_bytes, int advise) const override; + + virtual device_id get_device() const override; private: ze_context_handle_t _ctx; ze_device_handle_t _dev; uint32_t _global_mem_ordinal; + device_id _dev_id; const 
ze_hardware_manager* _hw_manager; }; diff --git a/include/hipSYCL/std/stdpar/detail/allocation_map.hpp b/include/hipSYCL/std/stdpar/detail/allocation_map.hpp index 93ce420d3..e24d0ff42 100644 --- a/include/hipSYCL/std/stdpar/detail/allocation_map.hpp +++ b/include/hipSYCL/std/stdpar/detail/allocation_map.hpp @@ -8,8 +8,8 @@ * See file LICENSE in the project root for full license details. */ // SPDX-License-Identifier: BSD-2-Clause -#ifndef HIPSYCL_ALLOCATION_MAP_HPP -#define HIPSYCL_ALLOCATION_MAP_HPP +#ifndef ACPP_STDPAR_ALLOCATION_DATA_STRUCTURES_HPP +#define ACPP_STDPAR_ALLOCATION_DATA_STRUCTURES_HPP #include @@ -22,468 +22,14 @@ #include #include +#include "hipSYCL/common/allocation_map.hpp" + extern "C" void *__libc_malloc(size_t); extern "C" void __libc_free(void*); namespace hipsycl::stdpar { -struct default_allocation_map_payload {}; - -template -class bit_tree { -protected: - - static constexpr int num_levels = sizeof...(Bit_sizes); - static constexpr int root_level_idx = num_levels - 1; - static constexpr int bitsizes[num_levels] = {Bit_sizes...}; - - static constexpr int get_num_entries_in_level(int level) { - return 1ull << bitsizes[level]; - } - - static constexpr int get_bitoffset_in_level(int level) { - int result = 0; - for(int i = 0; i < level; ++i) { - result += bitsizes[i]; - } - return result; - } - - static constexpr int get_index_in_level(Int_type address, int level) { - Int_type bitmask = get_n_low_bits_set(bitsizes[level]); - return (address >> get_bitoffset_in_level(level)) & bitmask; - } - - static constexpr uint64_t get_n_low_bits_set(int n) { - if(n == 64) - return ~0ull; - return (1ull << n) - 1; - } - - static constexpr uint64_t get_space_spanned_by_node_in_level(int level) { - uint64_t result = 1; - for(int i = 0; i < level; ++i) - result *= get_num_entries_in_level(level); - return result; - } - - template - static T* alloc(int count) { - return static_cast(__libc_malloc(sizeof(T) * count)); - } - - static void free(void* ptr) { - 
__libc_free(ptr); - } -}; - -template -class allocation_map : public bit_tree { -public: - static_assert(sizeof(void*) == 8, "Unsupported pointer size"); - static_assert(std::is_trivial_v, "UserPayload must be trivial type"); - - allocation_map() - : _num_in_progress_operations{0} {} - - struct value_type : public UserPayload { - std::size_t allocation_size; - }; - - // Access entry of allocation that address belongs to, or nullptr if the address - // does not belong to a known allocation. - value_type* get_entry(uint64_t address, uint64_t& root_address) noexcept { - insert_or_get_entry_lock lock{_num_in_progress_operations}; - root_address = 0; - int num_leaf_attempts = 0; - return get_entry(_root, address, num_leaf_attempts, root_address); - } - - // Access entry of allocation that has the given address. Unlike get_entry(), - // this does not succeed if the address does not point to the base of the allocation. - value_type* get_entry_of_root_address(uint64_t address) noexcept { - insert_or_get_entry_lock lock{_num_in_progress_operations}; - return get_entry_of_root_address(_root, address); - } - - // Insert new element. Element's allocation range must be - // non-overlapping w.r.t existing entries. - // ~0ull is unsupported, because then non-zero allocation - // ranges cannot be expressed. 
- bool insert(uint64_t address, const value_type& v) { - insert_or_get_entry_lock lock{_num_in_progress_operations}; - return insert(_root, address, v); - } - - bool erase(uint64_t address) { - erase_lock lock{_num_in_progress_operations}; - return erase(_root, address); - } - - ~allocation_map() { - for(int i = 0; i < get_num_entries_in_level(root_level_idx); ++i) { - auto* ptr = _root.children[i].load(std::memory_order_acquire); - if(ptr) - release(*ptr); - } - } - -private: - // Useful for debugging/printing - template - void with_decomposed_address(uint64_t address, int current_level, F&& handler) { - for(int i = root_level_idx; i >= current_level; --i) { - handler(get_index_in_level(address, i)); - } - for(int i = current_level - 1; i >= 0; --i) { - handler(-1); - } - } - - template - void print(Ostream& ostr, uint64_t address, int level) { - with_decomposed_address(address, level, [&](int x){ - if(x >= 0) - ostr << x << "."; - else - ostr << "x"; - }); - ostr << "\n"; - } - - struct leaf_node { - leaf_node() - : num_entries {} { - for(int i = 0; i < get_num_entries_in_level(0); ++i) { - entries[i].allocation_size = 0; - } - } - - value_type entries [get_num_entries_in_level(0)]; - std::atomic num_entries; - }; - - template - struct intermediate_node { - private: - static constexpr auto make_child() { - if constexpr (Level > 1) return - intermediate_node{}; - else return leaf_node{}; - } - public: - intermediate_node() - : children{}, num_entries{} {} - - using child_type = decltype(make_child()); - - std::atomic children [get_num_entries_in_level(Level)]; - std::atomic num_entries; - }; - - value_type *get_entry(leaf_node ¤t_node, uint64_t address, - int &/*num_leaf_attempts*/, - uint64_t &root_address) noexcept { - int start_address = 0; - - uint64_t max_local_address = - root_address | (get_num_entries_in_level(0) - 1); - - if(max_local_address <= address) - start_address = get_num_entries_in_level(0) - 1; - else - start_address = 
get_index_in_level(address, 0); - - for (int local_address = start_address; local_address >= 0; - --local_address) { - - auto& element = current_node.entries[local_address]; - - std::size_t allocation_size = - __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); - if(allocation_size > 0) { - - uint64_t root_address_candidate = - root_address | - (static_cast(local_address) << get_bitoffset_in_level(0)); - - uint64_t allocation_end = root_address_candidate + allocation_size; - if(address >= root_address_candidate && address < allocation_end) { - root_address = root_address_candidate; - return &element; - } else { - return nullptr; - } - - } - } - return nullptr; - } - - template - value_type *get_entry(intermediate_node ¤t_node, - uint64_t address, - int& num_leaf_attempts, - uint64_t& root_address) noexcept { - // If the queried address is too close to the next allocation, - // it can happen that the search converges on the next allocation. - // Therefore, to exclude that case, if a search fails, we also - // need to try again with the next allocation before that. - // This variable counts how many leaves we have accessed. If it - // reaches two, we can abort. - if constexpr(Level == root_level_idx) { - num_leaf_attempts = 0; - } - - uint64_t max_local_address = - root_address | - get_n_low_bits_set(get_bitoffset_in_level(Level) + bitsizes[Level]); - - // We are always looking for the next allocation preceding the - // current address. If the maximum local address in this node - // cannot reach the search address, (e.g. if we are looking in - // a preceding node at the same level), we need to start from - // the maximum address. Otherwise, we need to look at the bits - // set in this address. 
- int start_address = 0; - if(max_local_address <= address) - start_address = get_num_entries_in_level(Level) - 1; - else - start_address = get_index_in_level(address, Level); - - for (int local_address = start_address; - local_address >= 0; --local_address) { - - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - - if(ptr) { - uint64_t root_address_candidate = - root_address | (static_cast(local_address) - << get_bitoffset_in_level(Level)); - - auto* ret = get_entry(*ptr, address, num_leaf_attempts, - root_address_candidate); - // If we are in level 1, ret refers to a leaf node - if constexpr(Level == 1) { - ++num_leaf_attempts; - } - - if(ret) { - root_address = root_address_candidate; - return ret; - } else if(num_leaf_attempts >= 2) { - // We can abort if we have looked at the first hit leaf node, - // and the one before that. - return nullptr; - } - } - } - return nullptr; - } - - value_type *get_entry_of_root_address(leaf_node ¤t_node, uint64_t address) noexcept { - int local_address = get_index_in_level(address, 0); - - auto& element = current_node.entries[local_address]; - std::size_t allocation_size = - __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); - - if (allocation_size > 0) { - return &element; - } - - return nullptr; - } - - template - value_type *get_entry_of_root_address(intermediate_node ¤t_node, - uint64_t address) noexcept { - int local_address = get_index_in_level(address, Level); - - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - - if(ptr) { - return get_entry_of_root_address(*ptr, address); - } - return nullptr; - } - - bool insert(leaf_node ¤t_node, uint64_t address, const value_type &v) { - - int local_address = get_index_in_level(address, 0); - - std::size_t *allocation_size_ptr = - &(current_node.entries[local_address].allocation_size); - - std::size_t allocation_size = __atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE); - if(allocation_size > 
0) { - // Entry is already occupied - return false; - } - - __atomic_store_n(allocation_size_ptr, v.allocation_size, __ATOMIC_RELEASE); - current_node.entries[local_address].UserPayload::operator=(v); - - current_node.num_entries.fetch_add( - 1, std::memory_order_acq_rel); - - return true; - } - - template - bool insert(intermediate_node ¤t_node, uint64_t address, - const value_type &v) { - using child_t = typename intermediate_node::child_type; - - int local_address = get_index_in_level(address, Level); - - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - - if(!ptr) { - child_t* new_child = alloc(1); - new (new_child) child_t{}; - - if (!current_node.children[local_address].compare_exchange_strong( - ptr /* == nullptr*/, new_child, std::memory_order_acq_rel)) { - // Assigning new child has failed because child is no longer nullptr - // -> free new child again - destroy(*new_child); - this->free(new_child); - } else { - current_node.num_entries.fetch_add( - 1, std::memory_order_acq_rel); - ptr = new_child; - } - } - - return insert(*ptr, address, v); - } - - bool erase(leaf_node& current_node, uint64_t address) { - int local_address = get_index_in_level(address, 0); - - std::size_t *allocation_size_ptr = - &(current_node.entries[local_address].allocation_size); - // Entry was already deleted or does not exist - if(__atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE) == 0) - return false; - - __atomic_store_n(allocation_size_ptr, 0, __ATOMIC_RELEASE); - - current_node.num_entries.fetch_sub( - 1, std::memory_order_acq_rel); - - return true; - } - - template - bool erase(intermediate_node ¤t_node, uint64_t address) { - - int local_address = get_index_in_level(address, Level); - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - if(!ptr) - return false; - - bool result = erase(*ptr, address); - if(result) { - if(ptr->num_entries.load(std::memory_order_acquire) == 0) { - auto *current_ptr = 
current_node.children[local_address].exchange( - nullptr, std::memory_order_acq_rel); - // TODO: We could potentially get erase() lock-free - // by counting by how many ops each node is currently used, - // and waiting here until the count turns to 0. - if(current_ptr) { - destroy(*current_ptr); - this->free(current_ptr); - current_node.num_entries.fetch_sub( - 1, std::memory_order_acq_rel); - } - } - } - return result; - } - - void release(leaf_node& current_node) { - destroy(current_node); - } - - template - void release(intermediate_node& current_node) { - for(int i = 0; i < get_num_entries_in_level(Level); ++i){ - if (auto *ptr = current_node.children[i].load( - std::memory_order_acquire)) { - release(*ptr); - this->free(ptr); - } - } - destroy(current_node); - } - - void destroy(leaf_node& node) { - node.~leaf_node(); - } - - template - void destroy(intermediate_node& node) { - node.~intermediate_node(); - } - - struct erase_lock { - public: - erase_lock(std::atomic& op_counter) - : _op_counter{op_counter} { - int expected = 0; - while (!_op_counter.compare_exchange_strong( - expected, -1, std::memory_order_release, std::memory_order_relaxed)) { - expected = 0; - } - } - - ~erase_lock() { - _op_counter.store(0, std::memory_order_release); - } - private: - std::atomic& _op_counter; - }; - - struct insert_or_get_entry_lock { - public: - insert_or_get_entry_lock(std::atomic& op_counter) - : _op_counter{op_counter} { - int expected = std::max(0, _op_counter.load(std::memory_order_acquire)); - while (!_op_counter.compare_exchange_strong( - expected, expected+1, std::memory_order_release, - std::memory_order_relaxed)) { - if(expected < 0) - expected = 0; - } - } - - ~insert_or_get_entry_lock() { - _op_counter.fetch_sub(1, std::memory_order_acq_rel); - } - private: - std::atomic& _op_counter; - }; - - intermediate_node _root; - std::atomic _num_in_progress_operations; -}; - - - - - - - - template class libc_allocator{ public: @@ -505,6 +51,17 @@ class libc_allocator{ 
} }; +struct libc_untyped_allocator { + static void* allocate(size_t n) { + return __libc_malloc(n); + } + + static void deallocate(void* ptr) { + __libc_free(ptr); + } +}; + + template bool operator==(libc_allocator const &, libc_allocator const &) noexcept { return true; @@ -515,6 +72,10 @@ bool operator!=(libc_allocator const &x, return !(x == y); } +template +using allocation_map = + common::allocation_map; + class free_space_map { public: free_space_map(std::size_t max_assignable_space) diff --git a/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp b/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp index aefc0db98..a74af9f36 100644 --- a/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp +++ b/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp @@ -32,6 +32,7 @@ #include "allocation_map.hpp" +#include "hipSYCL/runtime/application.hpp" #include "offload_heuristic_db.hpp" #include "hipSYCL/runtime/settings.hpp" #include "hipSYCL/sycl/info/device.hpp" @@ -252,6 +253,15 @@ class memory_pool { assert(is_from_pool(ptr)); assert(is_from_pool((char*)ptr+size)); assert((uint64_t)ptr % _page_size == 0); + + // Inform the runtime that there is a new user allocation + // by invoking the runtime hook. We need to do this manually + // because memory pool directly uses raw backend allocation commands. 
+ rt::application::event_handler_layer().on_new_allocation( + ptr, size, + rt::allocation_info{_dev, + rt::allocation_info::allocation_type::shared}); + return ptr; } @@ -262,14 +272,14 @@ class memory_pool { if(_pool && is_from_pool(ptr)) { uint64_t address = reinterpret_cast(ptr)-reinterpret_cast(_base_address); _free_space_map.release(address, size); + + rt::application::event_handler_layer().on_deallocation(ptr); } } ~memory_pool() { // Memory pool might be destroyed after runtime shutdown, so rely on OS // to clean up for now - //if(_pool) - // sycl::free(_pool, detail::single_device_dispatch::get_queue()); } std::size_t get_size() const { @@ -285,13 +295,25 @@ class memory_pool { } private: + void* raw_malloc_shared(std::size_t bytes, sycl::queue& q) { + auto *allocator = sycl::detail::select_usm_allocator(q.get_context(), + q.get_device()); + return allocator->raw_allocate_usm(bytes); + } + void init() { HIPSYCL_DEBUG_INFO << "[stdpar] Building a memory pool of size " << static_cast(_pool_size) / (1024 * 1024 * 1024) << " GB" << std::endl; + auto& q = detail::single_device_dispatch::get_queue(); + _dev = q.get_device().AdaptiveCpp_device_id(); + + // We need to call raw_allocate_usm so that we can inform the runtime's allocation + // tracking mechanism of actual user allocations, not just of the memory pool as a + // whole. 
// Make sure to allocate an additional page so that we can fix alignment if needed - _pool = sycl::malloc_shared( - _pool_size + _page_size, detail::single_device_dispatch::get_queue()); + _pool = raw_malloc_shared(_pool_size + _page_size, q); + uint64_t aligned_pool_base = next_multiple_of((uint64_t)_pool, _page_size); _base_address = (void*)aligned_pool_base; assert(aligned_pool_base % _page_size == 0); @@ -303,6 +325,7 @@ class memory_pool { void* _base_address; free_space_map _free_space_map; std::size_t _page_size; + rt::device_id _dev; }; class unified_shared_memory { diff --git a/include/hipSYCL/sycl/buffer.hpp b/include/hipSYCL/sycl/buffer.hpp index 51c8679db..91f027b2e 100644 --- a/include/hipSYCL/sycl/buffer.hpp +++ b/include/hipSYCL/sycl/buffer.hpp @@ -1210,17 +1210,18 @@ class buffer : public detail::property_carrying_object if(!_impl->data->has_allocation(host_device)){ if(this->has_property()){ // TODO: Actually may need to use non-host backend here... - host_ptr = - rt->backends().get(host_device.get_backend()) - ->get_allocator(host_device) - ->allocate_optimized_host( - alignof(T), _impl->data->get_num_elements().size() * sizeof(T)); + auto* allocator = rt->backends().get(host_device.get_backend()) + ->get_allocator(host_device); + host_ptr = rt::allocate_host(allocator, alignof(T), + _impl->data->get_num_elements().size() * + sizeof(T)); } else { - host_ptr = - rt->backends().get(host_device.get_backend()) - ->get_allocator(host_device) - ->allocate( - alignof(T), _impl->data->get_num_elements().size() * sizeof(T)); + auto *allocator = rt->backends() + .get(host_device.get_backend()) + ->get_allocator(host_device); + host_ptr = rt::allocate_device(allocator, alignof(T), + _impl->data->get_num_elements().size() * + sizeof(T)); } if(!host_ptr) diff --git a/include/hipSYCL/sycl/extensions.hpp b/include/hipSYCL/sycl/extensions.hpp index 57cbc3c1b..5456a4b43 100644 --- a/include/hipSYCL/sycl/extensions.hpp +++ b/include/hipSYCL/sycl/extensions.hpp @@ 
-74,6 +74,7 @@ #define ACPP_EXT_QUEUE_PRIORITY #define ACPP_EXT_SPECIALIZED #define ACPP_EXT_DYNAMIC_FUNCTIONS +#define ACPP_EXT_RESTRICT_PTR #define ACPP_EXT_JIT_COMPILE_IF // KHR extensions diff --git a/include/hipSYCL/sycl/libkernel/restrict.hpp b/include/hipSYCL/sycl/libkernel/restrict.hpp new file mode 100644 index 000000000..d92011931 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/restrict.hpp @@ -0,0 +1,69 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_RESTRICT_HPP +#define ACPP_RESTRICT_HPP + +#include + +namespace hipsycl { +namespace sycl { + +namespace detail { + +template +struct __acpp_sscp_emit_param_type_annotation_restrict { + T value; +}; + +} // namespace detail + +template class AdaptiveCpp_restrict_ptr { +public: + template ::value>> + AdaptiveCpp_restrict_ptr() : _value{} {} + + AdaptiveCpp_restrict_ptr(const T &value) : _value{value} {} + + AdaptiveCpp_restrict_ptr(const AdaptiveCpp_restrict_ptr &other) + : _value{other._value.value} {} + + AdaptiveCpp_restrict_ptr(sycl::AdaptiveCpp_restrict_ptr &&other) { + swap(*this, other); + } + + AdaptiveCpp_restrict_ptr &operator=(const T &value) { + AdaptiveCpp_restrict_ptr tmp{value}; + swap(*this, tmp); + return *this; + } + + AdaptiveCpp_restrict_ptr &operator=(AdaptiveCpp_restrict_ptr other) { + swap(*this, other); + return *this; + } + + friend void swap(AdaptiveCpp_restrict_ptr &first, + AdaptiveCpp_restrict_ptr &second) { + using std::swap; + swap(first._value.value, second._value.value); + } + + operator T *() const { return _value.value; } + +private: + detail::__acpp_sscp_emit_param_type_annotation_restrict _value; +}; + +} // namespace sycl +} // namespace hipsycl + 
+#endif diff --git a/include/hipSYCL/sycl/usm.hpp b/include/hipSYCL/sycl/usm.hpp index 350869648..26bf67655 100644 --- a/include/hipSYCL/sycl/usm.hpp +++ b/include/hipSYCL/sycl/usm.hpp @@ -35,7 +35,8 @@ namespace sycl { inline void *malloc_device(size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_device_allocator(dev)->allocate(0, num_bytes); + return rt::allocate_device(detail::select_device_allocator(dev), 0, + num_bytes); } template @@ -55,7 +56,7 @@ T* malloc_device(std::size_t count, const queue &q) { inline void *aligned_alloc_device(std::size_t alignment, std::size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_device_allocator(dev)->allocate(alignment, num_bytes); + return rt::allocate_device(detail::select_device_allocator(dev), alignment, num_bytes); } template @@ -79,7 +80,7 @@ T *aligned_alloc_device(std::size_t alignment, std::size_t count, // Restricted USM inline void *malloc_host(std::size_t num_bytes, const context &ctx) { - return detail::select_usm_allocator(ctx)->allocate_optimized_host(0, num_bytes); + return rt::allocate_host(detail::select_usm_allocator(ctx), 0, num_bytes); } template T *malloc_host(std::size_t count, const context &ctx) { @@ -96,7 +97,7 @@ template T *malloc_host(std::size_t count, const queue &q) { inline void *malloc_shared(std::size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_usm_allocator(ctx, dev)->allocate_usm(num_bytes); + return rt::allocate_shared(detail::select_usm_allocator(ctx, dev), num_bytes); } template @@ -114,8 +115,8 @@ template T *malloc_shared(std::size_t count, const queue &q) { inline void *aligned_alloc_host(std::size_t alignment, std::size_t num_bytes, const context &ctx) { - return detail::select_usm_allocator(ctx)->allocate_optimized_host(alignment, - num_bytes); + return rt::allocate_host(detail::select_usm_allocator(ctx), alignment, + num_bytes); } template @@ -137,7 +138,7 @@ T 
*aligned_alloc_host(std::size_t alignment, std::size_t count, inline void *aligned_alloc_shared(std::size_t alignment, std::size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_usm_allocator(ctx, dev)->allocate_usm(num_bytes); + return rt::allocate_shared(detail::select_usm_allocator(ctx, dev), num_bytes); } template @@ -224,7 +225,7 @@ T *aligned_alloc(std::size_t alignment, std::size_t count, const sycl::queue &q, } inline void free(void *ptr, const sycl::context &ctx) { - return detail::select_usm_allocator(ctx)->free(ptr); + return rt::deallocate(detail::select_usm_allocator(ctx), ptr); } inline void free(void *ptr, const sycl::queue &q) { diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index b3da6fee4..d54ca6784 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -270,12 +270,26 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { // This is what allows us to specialize code for different backends. HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing branches post S2 IR constant application...\n"; IRConstant::optimizeCodeAfterConstantModification(M, MAM); + // Rerun kernel outlining pass so that we don't include unneeded functions // that are specific to other backends. 
HIPSYCL_DEBUG_INFO << "LLVMToBackend: Reoutlining kernels...\n"; KernelOutliningPass KP{OutliningEntrypoints}; KP.run(M, MAM); + for(auto& P : NoAliasParameters) { + auto* F = M.getFunction(P.first); + if(F) { + for(int i : P.second) { + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Attaching noalias attribute to parameter " << i + << " of kernel " << P.first << "\n"; + if(i < F->getFunctionType()->getNumParams()) + if(!F->hasParamAttribute(i, llvm::Attribute::AttrKind::NoAlias)) + F->addParamAttr(i, llvm::Attribute::AttrKind::NoAlias); + } + } + } + // These optimizations should be run before __acpp_sscp_* builtins // are resolved, so before backend bitcode libraries are linked. We thus // run them prior to flavoring. @@ -579,6 +593,10 @@ void LLVMToBackendTranslator::specializeFunctionCalls( }; } +void LLVMToBackendTranslator::setNoAliasKernelParam(const std::string &KernelName, int ParamIndex) { + NoAliasParameters[KernelName].push_back(ParamIndex); +} + void LLVMToBackendTranslator::provideExternalSymbolResolver(ExternalSymbolResolver Resolver) { this->SymbolResolver = Resolver; this->HasExternalSymbolResolver = true; diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index b35e7f4dc..c2e67b9d5 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -23,6 +23,8 @@ set(HIPSYCL_RT_EXTRA_LINKER_FLAGS ${HIPSYCL_RT_EXTRA_LINKER_FLAGS} ${HIPSYCL_STD set(CMAKE_INSTALL_RPATH ${base} ${base}/hipSYCL) add_library(acpp-rt SHARED + allocator.cpp + allocation_tracker.cpp application.cpp runtime.cpp error.cpp @@ -36,6 +38,7 @@ add_library(acpp-rt SHARED kernel_cache.cpp kernel_configuration.cpp multi_queue_executor.cpp + runtime_event_handlers.cpp dag.cpp dag_node.cpp dag_builder.cpp diff --git a/src/runtime/adaptivity_engine.cpp b/src/runtime/adaptivity_engine.cpp index 1fd006f53..9c900ef04 100644 --- a/src/runtime/adaptivity_engine.cpp +++ b/src/runtime/adaptivity_engine.cpp @@ -12,10 +12,13 @@ #include "hipSYCL/common/appdb.hpp" #include 
"hipSYCL/glue/llvm-sscp/fcall_specialization.hpp" +#include "hipSYCL/runtime/allocation_tracker.hpp" #include "hipSYCL/runtime/kernel_configuration.hpp" #include "hipSYCL/glue/llvm-sscp/jit.hpp" #include "hipSYCL/runtime/application.hpp" #include "hipSYCL/common/filesystem.hpp" +#include "hipSYCL/runtime/runtime_event_handlers.hpp" +#include #include @@ -244,6 +247,14 @@ kernel_adaptivity_engine::finalize_binary_configuration( std::memcpy(&buffer_value, _arg_mapper.get_mapped_args()[i], arg_size); config.set_specialized_kernel_argument(i, buffer_value); } + + if (_kernel_info->get_argument_type(i) == + hcf_kernel_info::argument_type::pointer) { + if (has_annotation(_kernel_info, i, + hcf_kernel_info::annotation_type::noalias)) { + config.set_kernel_param_flag(i, kernel_param_flag::noalias); + } + } } // Handle auto alignment specialization @@ -262,9 +273,68 @@ kernel_adaptivity_engine::finalize_binary_configuration( } } } + + if(application::get_settings().get()) { + // Detect whether pointer arguments qualify for NoAlias/restrict semantics. + // This is achieved by determining the base of the allocations for all pointer + // kernel arguments, and checking whether there are other pointer arguments + // from the same allocation. 
+ constexpr int max_allocations = 32; + uint64_t allocation_base_addresses [max_allocations] = {}; + bool allocations_exceeded = false; + for(int alloc_index = 0, i = 0; i < _kernel_info->get_num_parameters(); ++i) { + if(_kernel_info->get_argument_type(i) == hcf_kernel_info::argument_type::pointer) { + auto arg_size = _kernel_info->get_argument_size(i); + if(arg_size == sizeof(void*)) { + void* ptr_arg; + std::memcpy(&ptr_arg, _arg_mapper.get_mapped_args()[i], arg_size); + if (ptr_arg) { + allocation_info ainfo; + uint64_t allocation_base; + if(allocation_tracker::query_allocation(ptr_arg, ainfo, allocation_base)) { + allocation_base_addresses[alloc_index] = allocation_base; + } + } + } + ++alloc_index; + if (alloc_index >= max_allocations) { + allocations_exceeded = true; + break; + } + } + } + if (!allocations_exceeded) { + for (int alloc_index = 0, i = 0; i < _kernel_info->get_num_parameters(); + ++i) { + if (_kernel_info->get_argument_type(i) == + hcf_kernel_info::argument_type::pointer) { + if (allocation_base_addresses[alloc_index] != 0) { + bool argument_might_alias = false; + for (int k = 0; k < max_allocations; ++k) { + if (k != alloc_index) { + if (allocation_base_addresses[alloc_index] == + allocation_base_addresses[k]) { + argument_might_alias = true; + break; + } + } + } + if (!argument_might_alias) { + HIPSYCL_DEBUG_INFO << "adaptivity_engine: Inferred noalias " + "pointer semantics for kernel argument " + << i << std::endl; + config.set_kernel_param_flag(i, kernel_param_flag::noalias); + } + } + ++alloc_index; + } + } + } + } } if(_adaptivity_level > 1) { + auto base_id = config.generate_id(); // Automatic application of specialization constants by detecting @@ -329,5 +399,6 @@ std::string kernel_adaptivity_engine::select_image_and_kernels( return glue::jit::select_image(_kernel_info, kernel_names_out); } } + } } \ No newline at end of file diff --git a/src/runtime/allocation_tracker.cpp b/src/runtime/allocation_tracker.cpp new file mode 100644 
index 000000000..43e3dfee8 --- /dev/null +++ b/src/runtime/allocation_tracker.cpp @@ -0,0 +1,46 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/runtime/allocation_tracker.hpp" + + +namespace hipsycl::rt { + +namespace { + +using amap_t = common::allocation_map; + +amap_t& get_allocation_map() { + static amap_t amap; + return amap; +} + +} + +bool allocation_tracker::register_allocation(const void *ptr, std::size_t size, + const allocation_info &info) { + using value_type = amap_t::value_type; + + value_type v; + v.allocation_info::operator=(info); + v.allocation_size = size; + return get_allocation_map().insert(reinterpret_cast(ptr), v); +} + +bool allocation_tracker::unregister_allocation(const void* ptr) { + return get_allocation_map().erase(reinterpret_cast(ptr)); +} + +bool allocation_tracker::query_allocation(const void *ptr, allocation_info &out, + uint64_t &root_address) { + return get_allocation_map().get_entry(reinterpret_cast(ptr), root_address); +} +} diff --git a/src/runtime/allocator.cpp b/src/runtime/allocator.cpp new file mode 100644 index 000000000..087650bba --- /dev/null +++ b/src/runtime/allocator.cpp @@ -0,0 +1,61 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/runtime/allocator.hpp" +#include "hipSYCL/runtime/allocation_tracker.hpp" +#include "hipSYCL/runtime/application.hpp" +#include "hipSYCL/runtime/runtime_event_handlers.hpp" + +namespace hipsycl { +namespace rt { + +void *allocate_device(backend_allocator *alloc, size_t min_alignment, + size_t size_bytes) { + auto *ptr = alloc->raw_allocate(min_alignment, size_bytes); + if(ptr) { + application::event_handler_layer().on_new_allocation( + ptr, size_bytes, + allocation_info{alloc->get_device(), + allocation_info::allocation_type::device}); + } + return ptr; +} + +void *allocate_host(backend_allocator *alloc, size_t min_alignment, + size_t bytes) { + auto* ptr = alloc->raw_allocate_optimized_host(min_alignment, bytes); + if(ptr) { + application::event_handler_layer().on_new_allocation( + ptr, bytes, + allocation_info{alloc->get_device(), + allocation_info::allocation_type::host}); + } + return ptr; +} + +void *allocate_shared(backend_allocator *alloc, size_t bytes) { + auto* ptr = alloc->raw_allocate_usm(bytes); + if(ptr) { + application::event_handler_layer().on_new_allocation( + ptr, bytes, + allocation_info{alloc->get_device(), + allocation_info::allocation_type::host}); + } + return ptr; +} + +void deallocate(backend_allocator* alloc, void *mem) { + alloc->raw_free(mem); + application::event_handler_layer().on_deallocation(mem); +} + +} +} diff --git a/src/runtime/application.cpp b/src/runtime/application.cpp index f9ced33c0..1b7cde5b7 100644 --- a/src/runtime/application.cpp +++ b/src/runtime/application.cpp @@ -56,6 +56,11 @@ async_error_list& application::errors() { return errors; } +runtime_event_handlers& application::event_handler_layer() { + static runtime_event_handlers h; + return h; +} + } } diff --git a/src/runtime/cuda/cuda_allocator.cpp b/src/runtime/cuda/cuda_allocator.cpp index 97c4f8a0a..4eeacec09 100644 --- a/src/runtime/cuda/cuda_allocator.cpp +++ 
b/src/runtime/cuda/cuda_allocator.cpp @@ -21,7 +21,7 @@ cuda_allocator::cuda_allocator(backend_descriptor desc, int cuda_device) : _backend_descriptor{desc}, _dev{cuda_device} {} -void *cuda_allocator::allocate(size_t min_alignment, size_t size_bytes) +void *cuda_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { void *ptr; cuda_device_manager::get().activate_device(_dev); @@ -38,8 +38,8 @@ void *cuda_allocator::allocate(size_t min_alignment, size_t size_bytes) return ptr; } -void *cuda_allocator::allocate_optimized_host(size_t min_alignment, - size_t bytes) { +void *cuda_allocator::raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) { void *ptr; cuda_device_manager::get().activate_device(_dev); @@ -55,7 +55,7 @@ void *cuda_allocator::allocate_optimized_host(size_t min_alignment, return ptr; } -void cuda_allocator::free(void *mem) { +void cuda_allocator::raw_free(void *mem) { pointer_info info; result query_result = query_pointer(mem, info); @@ -79,7 +79,7 @@ void cuda_allocator::free(void *mem) { } } -void * cuda_allocator::allocate_usm(size_t bytes) +void * cuda_allocator::raw_allocate_usm(size_t bytes) { cuda_device_manager::get().activate_device(_dev); @@ -157,5 +157,9 @@ result cuda_allocator::mem_advise(const void *addr, std::size_t num_bytes, return make_success(); } +device_id cuda_allocator::get_device() const { + return device_id{_backend_descriptor, _dev}; +} + } } diff --git a/src/runtime/dag_direct_scheduler.cpp b/src/runtime/dag_direct_scheduler.cpp index 72196273b..ce7d6e9c3 100644 --- a/src/runtime/dag_direct_scheduler.cpp +++ b/src/runtime/dag_direct_scheduler.cpp @@ -91,7 +91,7 @@ result ensure_allocation_exists(runtime *rt, // cause backends to align to the largest supported type. // TODO: A better solution might be to select a custom alignment // best on sizeof(T). This requires querying backend alignment capabilities. 
- void *ptr = allocator->allocate(0, num_bytes); + void *ptr = rt::allocate_device(allocator, 0, num_bytes); if(!ptr) return register_error( diff --git a/src/runtime/hip/hip_allocator.cpp b/src/runtime/hip/hip_allocator.cpp index 15fffa047..13335d01c 100644 --- a/src/runtime/hip/hip_allocator.cpp +++ b/src/runtime/hip/hip_allocator.cpp @@ -20,7 +20,7 @@ hip_allocator::hip_allocator(backend_descriptor desc, int hip_device) : _backend_descriptor{desc}, _dev{hip_device} {} -void *hip_allocator::allocate(size_t min_alignment, size_t size_bytes) +void *hip_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { void *ptr; hip_device_manager::get().activate_device(_dev); @@ -37,8 +37,8 @@ void *hip_allocator::allocate(size_t min_alignment, size_t size_bytes) return ptr; } -void *hip_allocator::allocate_optimized_host(size_t min_alignment, - size_t bytes) { +void *hip_allocator::raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) { void *ptr; hip_device_manager::get().activate_device(_dev); @@ -54,7 +54,7 @@ void *hip_allocator::allocate_optimized_host(size_t min_alignment, return ptr; } -void hip_allocator::free(void *mem) { +void hip_allocator::raw_free(void *mem) { pointer_info info; result query_result = query_pointer(mem, info); @@ -78,7 +78,7 @@ void hip_allocator::free(void *mem) { } } -void * hip_allocator::allocate_usm(size_t bytes) +void * hip_allocator::raw_allocate_usm(size_t bytes) { hip_device_manager::get().activate_device(_dev); @@ -173,5 +173,9 @@ result hip_allocator::mem_advise(const void *addr, std::size_t num_bytes, return make_success(); } +device_id hip_allocator::get_device() const { + return device_id{_backend_descriptor, _dev}; +} + } } diff --git a/src/runtime/kernel_cache.cpp b/src/runtime/kernel_cache.cpp index 748b620c6..c90c0dd22 100644 --- a/src/runtime/kernel_cache.cpp +++ b/src/runtime/kernel_cache.cpp @@ -104,6 +104,8 @@ hcf_kernel_info::hcf_kernel_info( } else if(entry.first == "fcall_specialized_config") { 
_known_annotations.back().push_back( annotation_type::fcall_specialized_config); + } else if(entry.first == "restrict") { + _known_annotations.back().push_back(annotation_type::noalias); } else { _string_annotations.back().push_back(entry.first); } diff --git a/src/runtime/ocl/ocl_allocator.cpp b/src/runtime/ocl/ocl_allocator.cpp index 35a3a338c..3312d85d1 100644 --- a/src/runtime/ocl/ocl_allocator.cpp +++ b/src/runtime/ocl/ocl_allocator.cpp @@ -17,10 +17,10 @@ namespace hipsycl { namespace rt { -ocl_allocator::ocl_allocator(ocl_usm* usm) -: _usm{usm} {} +ocl_allocator::ocl_allocator(rt::device_id dev, ocl_usm* usm) +: _dev{dev}, _usm{usm} {} -void* ocl_allocator::allocate(size_t min_alignment, size_t size_bytes) { +void* ocl_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { if(!_usm->is_available()) { register_error(__acpp_here(), error_info{"ocl_allocator: OpenCL device does not have valid USM provider", @@ -40,7 +40,7 @@ void* ocl_allocator::allocate(size_t min_alignment, size_t size_bytes) { return ptr; } -void *ocl_allocator::allocate_optimized_host(size_t min_alignment, +void *ocl_allocator::raw_allocate_optimized_host(size_t min_alignment, size_t bytes) { if(!_usm->is_available()) { register_error(__acpp_here(), @@ -60,7 +60,7 @@ void *ocl_allocator::allocate_optimized_host(size_t min_alignment, return ptr; } -void ocl_allocator::free(void *mem) { +void ocl_allocator::raw_free(void *mem) { if(!_usm->is_available()) { register_error(__acpp_here(), error_info{"ocl_allocator: OpenCL device does not have valid USM provider", @@ -76,7 +76,7 @@ void ocl_allocator::free(void *mem) { } } -void *ocl_allocator::allocate_usm(size_t bytes) { +void *ocl_allocator::raw_allocate_usm(size_t bytes) { if(!_usm->is_available()) { register_error(__acpp_here(), error_info{"ocl_allocator: OpenCL device does not have valid USM provider", @@ -103,6 +103,10 @@ bool ocl_allocator::is_usm_accessible_from(backend_descriptor b) const { return b.hw_platform == 
hardware_platform::ocl; } +device_id ocl_allocator::get_device() const { + return _dev; +} + result ocl_allocator::query_pointer(const void* ptr, pointer_info& out) const { if(!_usm->is_available()) { auto err = make_error(__acpp_here(), diff --git a/src/runtime/ocl/ocl_hardware_manager.cpp b/src/runtime/ocl/ocl_hardware_manager.cpp index 86f42c9f5..a1423fcfa 100644 --- a/src/runtime/ocl/ocl_hardware_manager.cpp +++ b/src/runtime/ocl/ocl_hardware_manager.cpp @@ -568,7 +568,10 @@ void ocl_hardware_context::init_allocator(ocl_hardware_manager *mgr) { "allocations are not possible on that device." << std::endl; } - _alloc = ocl_allocator{_usm_provider.get()}; + device_id dev{ + backend_descriptor{hardware_platform::ocl, api_platform::ocl}, + _dev_id}; + _alloc = ocl_allocator{dev, _usm_provider.get()}; } ocl_hardware_manager::ocl_hardware_manager() diff --git a/src/runtime/omp/omp_allocator.cpp b/src/runtime/omp/omp_allocator.cpp index 10ee049ca..94234795e 100644 --- a/src/runtime/omp/omp_allocator.cpp +++ b/src/runtime/omp/omp_allocator.cpp @@ -21,7 +21,7 @@ namespace rt { omp_allocator::omp_allocator(const device_id &my_device) : _my_device{my_device} {} -void *omp_allocator::allocate(size_t min_alignment, size_t size_bytes) { +void *omp_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { #if !defined(_WIN32) // posix requires alignment to be a multiple of sizeof(void*) if (min_alignment < sizeof(void*)) @@ -50,12 +50,12 @@ void *omp_allocator::allocate(size_t min_alignment, size_t size_bytes) { #endif } -void *omp_allocator::allocate_optimized_host(size_t min_alignment, +void *omp_allocator::raw_allocate_optimized_host(size_t min_alignment, size_t bytes) { - return this->allocate(min_alignment, bytes); + return this->raw_allocate(min_alignment, bytes); }; -void omp_allocator::free(void *mem) { +void omp_allocator::raw_free(void *mem) { #if !defined(_WIN32) std::free(mem); #else @@ -63,8 +63,8 @@ void omp_allocator::free(void *mem) { #endif } -void* 
omp_allocator::allocate_usm(size_t bytes) { - return this->allocate(0, bytes); +void* omp_allocator::raw_allocate_usm(size_t bytes) { + return this->raw_allocate(0, bytes); } bool omp_allocator::is_usm_accessible_from(backend_descriptor b) const { @@ -74,6 +74,10 @@ bool omp_allocator::is_usm_accessible_from(backend_descriptor b) const { return false; } +device_id omp_allocator::get_device() const { + return _my_device; +} + result omp_allocator::query_pointer(const void *ptr, pointer_info &out) const { // For a host device, USM is the same as host memory? diff --git a/src/runtime/runtime_event_handlers.cpp b/src/runtime/runtime_event_handlers.cpp new file mode 100644 index 000000000..6e56c63f8 --- /dev/null +++ b/src/runtime/runtime_event_handlers.cpp @@ -0,0 +1,42 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + + +#include "hipSYCL/runtime/runtime_event_handlers.hpp" +#include "hipSYCL/runtime/allocation_tracker.hpp" +#include "hipSYCL/runtime/application.hpp" +#include "hipSYCL/runtime/settings.hpp" + +namespace hipsycl { +namespace rt { + +runtime_event_handlers::runtime_event_handlers() { + _needs_allocation_tracking = application::get_settings().get< + setting::enable_allocation_tracking>(); +} + +void runtime_event_handlers::on_new_allocation(const void *ptr, + std::size_t size, + const allocation_info &info) { + if (_needs_allocation_tracking) { + allocation_tracker::register_allocation(ptr, size, info); + } +} + + +void runtime_event_handlers::on_deallocation(const void* ptr) { + if (_needs_allocation_tracking) { + allocation_tracker::unregister_allocation(ptr); + } +} + +} +} diff --git a/src/runtime/ze/ze_allocator.cpp b/src/runtime/ze/ze_allocator.cpp index e3567f842..4b870c83a 100644 --- a/src/runtime/ze/ze_allocator.cpp +++ b/src/runtime/ze/ze_allocator.cpp @@ -10,6 +10,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include +#include "hipSYCL/runtime/device_id.hpp" #include "hipSYCL/runtime/ze/ze_allocator.hpp" #include "hipSYCL/runtime/error.hpp" #include "hipSYCL/runtime/util.hpp" @@ -17,13 +18,18 @@ namespace hipsycl { namespace rt { -ze_allocator::ze_allocator(const ze_hardware_context *device, +ze_allocator::ze_allocator(std::size_t device_index, + const ze_hardware_context *device, const ze_hardware_manager *hw_manager) : _ctx{device->get_ze_context()}, _dev{device->get_ze_device()}, _global_mem_ordinal{device->get_ze_global_memory_ordinal()}, - _hw_manager{hw_manager} {} + _hw_manager{hw_manager} { + _dev_id = device_id{backend_descriptor{hardware_platform::level_zero, + api_platform::level_zero}, + static_cast(device_index)}; +} -void* ze_allocator::allocate(size_t min_alignment, size_t size_bytes) { +void* ze_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { void* out = nullptr; @@ 
-47,8 +53,8 @@ void* ze_allocator::allocate(size_t min_alignment, size_t size_bytes) { return out; } -void* ze_allocator::allocate_optimized_host(size_t min_alignment, - size_t bytes) { +void* ze_allocator::raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) { void* out = nullptr; ze_host_mem_alloc_desc_t desc; @@ -69,7 +75,7 @@ void* ze_allocator::allocate_optimized_host(size_t min_alignment, return out; } -void ze_allocator::free(void *mem) { +void ze_allocator::raw_free(void *mem) { ze_result_t err = zeMemFree(_ctx, mem); if(err != ZE_RESULT_SUCCESS) { @@ -79,7 +85,7 @@ void ze_allocator::free(void *mem) { } } -void* ze_allocator::allocate_usm(size_t bytes) { +void* ze_allocator::raw_allocate_usm(size_t bytes) { void* out = nullptr; @@ -167,5 +173,9 @@ result ze_allocator::mem_advise(const void *addr, std::size_t num_bytes, return make_success(); } +device_id ze_allocator::get_device() const { + return _dev_id; +} + } } diff --git a/src/runtime/ze/ze_backend.cpp b/src/runtime/ze/ze_backend.cpp index c49334a9e..1cfac7aab 100644 --- a/src/runtime/ze/ze_backend.cpp +++ b/src/runtime/ze/ze_backend.cpp @@ -58,7 +58,7 @@ ze_backend::ze_backend() { _hardware_manager = std::make_unique(); for(std::size_t i = 0; i < _hardware_manager->get_num_devices(); ++i) { - _allocators.push_back(ze_allocator{ + _allocators.push_back(ze_allocator{i, static_cast(_hardware_manager->get_device(i)), _hardware_manager.get()}); } diff --git a/tests/pstl/allocation_map.cpp b/tests/pstl/allocation_map.cpp index 3241eb877..dbc96fe8f 100644 --- a/tests/pstl/allocation_map.cpp +++ b/tests/pstl/allocation_map.cpp @@ -24,7 +24,8 @@ BOOST_AUTO_TEST_SUITE(pstl_allocation_map) -using amap_t = hipsycl::stdpar::allocation_map<>; +struct payload{}; +using amap_t = hipsycl::common::allocation_map; template void for_each_test_allocation(std::size_t n, F&& f) { diff --git a/tests/pstl/free_space_map.cpp b/tests/pstl/free_space_map.cpp index 894febd9f..960368969 100644 --- 
a/tests/pstl/free_space_map.cpp +++ b/tests/pstl/free_space_map.cpp @@ -24,7 +24,9 @@ BOOST_AUTO_TEST_SUITE(pstl_free_space_map) -using amap_t = hipsycl::stdpar::allocation_map<>; +struct payload {}; + +using amap_t = hipsycl::stdpar::allocation_map; using fmap_t = hipsycl::stdpar::free_space_map; uint64_t next_pow2(uint64_t x) { From 05fbc4b33db1627abece2f0955c701b64fc32185 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Thu, 28 Nov 2024 19:40:23 +0000 Subject: [PATCH 055/126] use a whitelist instead --- bin/acpp | 45 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/bin/acpp b/bin/acpp index ee072c903..11cb203dd 100755 --- a/bin/acpp +++ b/bin/acpp @@ -935,18 +935,51 @@ class acpp_config: def is_pure_linking_stage(self): return len(self.source_file_arguments) == 0 -def run_or_print(command, print_only, noplugin=False): +def filter_cmd_args(command, verbose = False): + new_cmd = [] + + whitelist = [ + "-I", "-D", "-W", "-std=","-pedantic-errors" + ] + + whitelist_enable_next = [ + "-isystem", "-o", "-c" + ] + + add_next_arg = True # to add clang call + for arg in command: + if add_next_arg: + new_cmd.append(arg) + add_next_arg = False + continue + + for w in whitelist: + if arg.startswith(w): + new_cmd.append(arg) + add_next_arg = False + continue - if(noplugin): - new_cmd = [] - for arg in command: - if not (arg.startswith("-fplugin=") or arg.startswith("-fpass-plugin")): + for w in whitelist_enable_next: + if arg.startswith(w): new_cmd.append(arg) - command = new_cmd + add_next_arg = True + continue + + if verbose: + for arg in command: + if not arg in new_cmd: + print("removed :",arg) + + return new_cmd + + +def run_or_print(command, print_only, noplugin=False): if not print_only: return subprocess.call(command) else: + if(noplugin): + command = filter_cmd_args(command,verbose=True) print(' '.join(command)) return 0 From 7ddf6f77476616ed90e1a13e75fbd7083ef1774e Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Thu, 28 Nov 2024 19:41:05 +0000 Subject: [PATCH 056/126] disable verbose --- bin/acpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/acpp b/bin/acpp index 11cb203dd..135348557 100755 --- a/bin/acpp +++ b/bin/acpp @@ -979,7 +979,7 @@ def run_or_print(command, print_only, noplugin=False): return subprocess.call(command) else: if(noplugin): - command = filter_cmd_args(command,verbose=True) + command = filter_cmd_args(command,verbose=False) print(' '.join(command)) return 0 From b2a1ad34b6adf70571c272d076f5389dba83acfd Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 28 Nov 2024 20:58:32 +0100 Subject: [PATCH 057/126] [NFC][doc] Example: Fix incorrect usage of T (#1620) --- doc/examples.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/examples.md b/doc/examples.md index 93d14b69c..7c1c57ad8 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -18,15 +18,15 @@ std::vector add(sycl::queue& q, data_type* dev_b = sycl::malloc_device(a.size(), q); data_type* dev_c = sycl::malloc_device(a.size(), q); - q.memcpy(dev_a, a.data(), sizeof(T) * a.size()); - q.memcpy(dev_b, b.data(), sizeof(T) * b.size()); - q.memcpy(dev_c, c.data(), sizeof(T) * c.size()); + q.memcpy(dev_a, a.data(), sizeof(data_type) * a.size()); + q.memcpy(dev_b, b.data(), sizeof(data_type) * b.size()); + q.memcpy(dev_c, c.data(), sizeof(data_type) * c.size()); q.parallel_for(a.size(), [=](sycl::id<1> idx){ dev_c[idx] = dev_a[idx] + dev_b[idx]; }); - q.memcpy(c.data(), dev_c, sizeof(T) * c.size()); + q.memcpy(c.data(), dev_c, sizeof(data_type) * c.size()); q.wait(); sycl::free(dev_a, q); From 1d7709b1c3b5a4f7fd0ce3d9edba829fec6e92a4 Mon Sep 17 00:00:00 2001 From: carbotaniuman <41451839+carbotaniuman@users.noreply.github.com> Date: Sun, 1 Dec 2024 07:23:49 -0800 Subject: [PATCH 058/126] Fix double negation for `has_system_svm` (#1623) --- 
src/runtime/ocl/ocl_hardware_manager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/ocl/ocl_hardware_manager.cpp b/src/runtime/ocl/ocl_hardware_manager.cpp index a1423fcfa..8747063a1 100644 --- a/src/runtime/ocl/ocl_hardware_manager.cpp +++ b/src/runtime/ocl/ocl_hardware_manager.cpp @@ -152,7 +152,7 @@ bool should_include_device(const std::string& dev_name, const cl::Device& dev) { info_query(dev); bool has_usm_extension = info_query(dev).find("cl_intel_unified_shared_memory") != std::string::npos; - bool has_system_svm = !(cap & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM); + bool has_system_svm = cap & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; if(!has_usm_extension && !has_system_svm) { HIPSYCL_DEBUG_WARNING << "ocl_hardware_manager: OpenCL device '" << dev_name From 076b6f2761b4588a5ec082f60e516b6316dce4e9 Mon Sep 17 00:00:00 2001 From: Marco Julian Solanki <173357676+marcosolanki@users.noreply.github.com> Date: Sun, 1 Dec 2024 16:24:19 +0100 Subject: [PATCH 059/126] Increment `cmake_minimum_required` to 3.10 to avoid deprecation warnings (#1622) --- CMakeLists.txt | 2 +- examples/CMakeLists.txt | 2 +- src/compiler/CMakeLists.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b45a621d..02b9e4529 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.9) +cmake_minimum_required(VERSION 3.10) if(NOT CMAKE_VERSION VERSION_LESS 3.12) cmake_policy(SET CMP0074 NEW) # Don't complain about using BOOST_ROOT... 
endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c6de126bf..949222073 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.5) +cmake_minimum_required(VERSION 3.10) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) diff --git a/src/compiler/CMakeLists.txt b/src/compiler/CMakeLists.txt index b60b7a05c..b108eec0b 100644 --- a/src/compiler/CMakeLists.txt +++ b/src/compiler/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.5) +cmake_minimum_required(VERSION 3.10) project(acpp-clang) get_filename_component(CLANG_BINARY_PREFIX ${CLANG_EXECUTABLE_PATH} DIRECTORY) From b66bbd71d4e58886b6e9cec118558a2429003db6 Mon Sep 17 00:00:00 2001 From: normallytangent <7634457+normallytangent@users.noreply.github.com> Date: Sun, 1 Dec 2024 19:44:31 +0100 Subject: [PATCH 060/126] Log debug output to stderr (#1267) --- include/hipSYCL/common/debug.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hipSYCL/common/debug.hpp b/include/hipSYCL/common/debug.hpp index 3f2c92aa6..e393e2840 100644 --- a/include/hipSYCL/common/debug.hpp +++ b/include/hipSYCL/common/debug.hpp @@ -45,7 +45,7 @@ class output_stream { private: output_stream() - : _debug_level {HIPSYCL_DEBUG_LEVEL}, _output_stream{std::cout} { + : _debug_level {HIPSYCL_DEBUG_LEVEL}, _output_stream{std::cerr} { #if !defined(HIPSYCL_COMPILER_COMPONENT) && !defined(HIPSYCL_TOOL_COMPONENT) _debug_level = rt::application::get_settings().get(); From e55293aeb5928991d3b6455aa0a9eebcebd8d14c Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 2 Dec 2024 05:29:20 +0100 Subject: [PATCH 061/126] Add support for stage 2 IR dumping (#1621) --- doc/env_variables.md | 41 +++++++ .../llvm-to-backend/LLVMToBackend.hpp | 2 + .../llvm-to-backend/LLVMToBackend.cpp | 107 +++++++++++++++++- 3 files changed, 149 insertions(+), 1 deletion(-) diff --git a/doc/env_variables.md b/doc/env_variables.md index 
62e17a4d3..618bc553e 100644 --- a/doc/env_variables.md +++ b/doc/env_variables.md @@ -33,3 +33,44 @@ * `ACPP_JITOPT_IADS_RELATIVE_THRESHOLD_MIN_DATA`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): Only consider kernels with at least many invocations for the relative threshold described above. Default: 1024. * `ACPP_JITOPT_IADS_RELATIVE_EVICTION_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): If the relative frequency of a kernel argument value falls below this threshold, the statistics entry for the the argument value may be evicted if space for other values is needed. * `ACPP_ENABLE_ALLOCATION_TRACKING`: If set to 1, allows the AdaptiveCpp runtime to track and register the allocations that it manages. This enables additional JIT-time optimizations. Set to 0 to disable. (Default: 0) + +## Environment variables to control dumping IR during JIT compilation + +AdaptiveCpp can dump the IR of the code during stage 2 compilation (JIT compilation) at various stages in the processing and optimization pipeline. +This feature only applies to the AdaptiveCpp generic JIT SSCP compiler (`--acpp-targets=generic`). + +It is primarily helpful for AdaptiveCpp developers for debugging or expert users who wish to understand how their input code is translated and processed in LLVM IR. + +These environment variables take the shape `ACPP_S2_DUMP_IR_` for various stages in the optimization process. +* If the variable is set to `1`, the IR will be stored in `.ll`. +* Otherwise, the content is interpreted as a filepath were the IR will be written to. + +Within one application run, AdaptiveCpp appends IR dumps to the dump file. When a new application run results in new dumps being generated to the same file, the file will be truncated first. 
+ +Available stages for dumping: +* `ACPP_S2_IR_DUMP_INPUT` - dumps the raw, unoptimized generic input LLVM IR +* `ACPP_S2_IR_DUMP_INITIAL_OUTLINING` - After initial kernel outlining +* `ACPP_S2_IR_DUMP_SPECIALIZATION` - After applying specializations to the kernel +* `ACPP_S2_IR_DUMP_REFLECTION` - After processing JIT-time reflection queries +* `ACPP_S2_IR_DUMP_JIT_OPTIMIZATIONS` - After processing optimizations that rely on JIT-time information` +* `ACPP_S2_IR_DUMP_BACKEND_FLAVORING` - After applying the "backend flavor", i.e. turning generic LLVM IR into IR that targets a specific backend. +* `ACPP_S2_IR_DUMP_FULL_OPTIMIZATIONS` - After running the full LLVM optimization pipeline on the code. +* `ACPP_S2_IR_DUMP_FINAL` - Final state of the LLVM IR before handing it off to lowering it to backend-specific formats (e.g. PTX, amdgcn ISA, SPIR-V). +* `ACPP_S2_IR_DUMP_ALL` - Dump all stages. + +A dump section for a stage in the dump file will take the following form: +``` +;---------------- Begin AdaptiveCpp IR dump -------------- +; AdaptiveCpp SSCP S2 IR dump; Compiling kernels: (KERNELS), stage: (STAGENAME) + +(LLVM code here) +;----------------- End AdaptiveCpp IR dump --------------- +``` +`(STAGENAME)` refers to to one of the stages listed above. `(KERNELS)` is an identifier that describes which kernels AdaptiveCpp is compiling in this IR. It contains the mangled function name of the kernels. + +In general, the dump file will contain multiple dump sections if dumping is enabled for multiple stages, or if multiple JIT compilations are triggered (e.g. multiple kernels are launched). + +If `ACPP_S2_DUMP_IR_FILTER` filter is non-empty, AdaptiveCpp will only dump IR if the kernel identifier corresponds to the one specified in this variable. +Note that this can still lead to multiple JIT compilation dumps, e.g. if AdaptiveCpp generates multiple specialized kernels based on runtime information for one C++ kernel. 
+ + diff --git a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp index 977fa975c..e987aea1e 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp @@ -216,6 +216,8 @@ class LLVMToBackendTranslator { void runKernelDeadArgumentElimination(llvm::Module &M, llvm::Function *F, PassHandler &PH, std::vector& RetainedIndicesOut); + std::string getCompilationIdentifier() const; + int S2IRConstantBackendId; std::vector OutliningEntrypoints; diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index d54ca6784..0a7f50635 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -41,15 +41,97 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include namespace hipsycl { namespace compiler { namespace { +template +std::optional getEnvironmentVariable(const std::string& Name) { + std::string EnvName = Name; + std::transform(EnvName.begin(), EnvName.end(), EnvName.begin(), ::toupper); + + if(const char* EnvVal = std::getenv(("ACPP_S2_"+EnvName).c_str())) { + T val; + std::stringstream sstr{std::string{EnvVal}}; + sstr >> val; + if (!sstr.fail() && !sstr.bad()) { + return val; + } + } + return {}; +} + +template +T getEnvironmentVariableOrDefault(const std::string& Name, + const T& Default) { + std::optional v = getEnvironmentVariable(Name); + if(v.has_value()) { + return v.value(); + } + return Default; +} + +void printModuleToFile(llvm::Module& M, const std::string& File, + const std::string& Header){ + + // Desired behavior is to truncate files for each application run, + // but append content in the dump file within one application run. 
+ static std::unordered_set UsedFiles; + auto OpenFlag = llvm::sys::fs::OpenFlags::OF_Append; + if(UsedFiles.find(File) == UsedFiles.end()) { + OpenFlag = llvm::sys::fs::OpenFlags::OF_None; + UsedFiles.insert(File); + } + + std::error_code EC; + llvm::raw_fd_ostream Out{File, EC, OpenFlag}; + Out << ";---------------- Begin AdaptiveCpp IR dump --------------\n"; + Out << Header; + M.print(Out, nullptr); + Out << ";----------------- End AdaptiveCpp IR dump ---------------\n"; +} + +void enableModuleStateDumping(llvm::Module &M, const std::string &PipelineStage, + const std::string &Kernels) { + std::string Filter = + getEnvironmentVariableOrDefault("DUMP_IR_FILTER", ""); + + std::string FallbackFileName = M.getSourceFileName()+".ll"; + std::string FileName = + getEnvironmentVariableOrDefault("DUMP_IR_" + PipelineStage, ""); + + if(FileName == "1") + FileName = FallbackFileName; + + std::string Header = + "; AdaptiveCpp SSCP S2 IR dump; Compiling kernels: " + Kernels + ", stage: " + PipelineStage + "\n"; + + if(FileName.length() != 0) { + if(Kernels == Filter || Filter.empty()) + printModuleToFile(M, FileName, Header); + } + + std::string AllFileName = + getEnvironmentVariableOrDefault("DUMP_IR_ALL", ""); + if(AllFileName == "1") + AllFileName = FallbackFileName; + + if(AllFileName.length() != 0 && AllFileName != FileName) { + if(Kernels == Filter || Filter.empty()) + printModuleToFile(M, AllFileName, Header); + } +} + bool linkBitcode(llvm::Module &M, std::unique_ptr OtherM, const std::string &ForcedTriple = "", const std::string &ForcedDataLayout = "", @@ -221,6 +303,7 @@ bool LLVMToBackendTranslator::fullTransformation(const std::string &LLVMIR, std: } bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { + enableModuleStateDumping(M, "input", getCompilationIdentifier()); HIPSYCL_DEBUG_INFO << "LLVMToBackend: Preparing backend flavoring...\n"; @@ -238,7 +321,7 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { 
InitialOutliningEntrypoints.push_back(FName); KernelOutliningPass InitialOutlining{InitialOutliningEntrypoints}; InitialOutlining.run(M, MAM); - + enableModuleStateDumping(M, "initial_outlining", getCompilationIdentifier()); // We need to resolve symbols now instead of after optimization, because we // may have to reoutline if the code that is linked in after symbol resolution // depends on IR constants. @@ -256,6 +339,8 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { // Return error in case applying specializations has caused error list to be populated if(!Errors.empty()) return false; + + enableModuleStateDumping(M, "specialization", getCompilationIdentifier()); // Process stage 2 reflection calls ReflectionFields["compiler_backend"] = this->getBackendId(); @@ -266,6 +351,8 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { ProcessS2ReflectionPass S2RP{ReflectionFields}; S2RP.run(M, MAM); + enableModuleStateDumping(M, "reflection", getCompilationIdentifier()); + // Optimize away unnecessary branches due to backend-specific S2IR constants // This is what allows us to specialize code for different backends. 
HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing branches post S2 IR constant application...\n"; @@ -319,12 +406,16 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { InstructionCleanupPass ICP; ICP.run(M, MAM); + enableModuleStateDumping(M, "jit_optimizations", getCompilationIdentifier()); + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Adding backend-specific flavor to IR...\n"; if(!this->toBackendFlavor(M, PH)) { HIPSYCL_DEBUG_INFO << "LLVMToBackend: Flavoring failed\n"; return false; } + enableModuleStateDumping(M, "backend_flavoring", getCompilationIdentifier()); + // Inline again to handle builtin definitions pulled in by backend flavors InliningPass.run(M, MAM); @@ -355,6 +446,10 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { } llvm::AlwaysInlinerPass{}.run(M, MAM); + enableModuleStateDumping(M, "full_optimizations", getCompilationIdentifier()); + + enableModuleStateDumping(M, "final", getCompilationIdentifier()); + bool ContainsUnsetIRConstants = false; S2IRConstant::forEachS2IRConstant(M, [&](S2IRConstant C) { if (C.isValid()) { @@ -724,6 +819,16 @@ void LLVMToBackendTranslator::setReflectionField(const std::string &str, uint64_ ReflectionFields[str] = value; } +std::string LLVMToBackendTranslator::getCompilationIdentifier() const { + std::string Result; + for(const auto& K : Kernels) { + Result += ""; + } + if(Result.empty()) + return ""; + return Result; +} + } } From d488e2a99841154c19d41bf435009bab45c1d28d Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Tue, 3 Dec 2024 04:46:41 +0000 Subject: [PATCH 062/126] Fix pointer to a temporary returned from kernel_configuration::data_ptr (#1625) --- include/hipSYCL/runtime/kernel_configuration.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hipSYCL/runtime/kernel_configuration.hpp b/include/hipSYCL/runtime/kernel_configuration.hpp index 4bfc11932..99521833e 100644 --- a/include/hipSYCL/runtime/kernel_configuration.hpp +++ 
b/include/hipSYCL/runtime/kernel_configuration.hpp @@ -256,7 +256,7 @@ class kernel_configuration { private: static const void* data_ptr(const char* data) { - return data_ptr(std::string{data}); + return data_ptr(data); } static const void* data_ptr(const std::string& data) { @@ -278,7 +278,7 @@ class kernel_configuration { } static std::size_t data_size(const char* data) { - return data_size(std::string{data}); + return data_size(std::string_view{data}); } static std::size_t data_size(const std::string& data) { From 2a5978f9da38a86be9be927d7b52e436876c43ac Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 25 Nov 2024 06:19:17 +0100 Subject: [PATCH 063/126] [stdpar] Add inclusive_scan and exclusive_scan --- doc/stdpar.md | 4 +- include/hipSYCL/algorithms/numeric.hpp | 90 +++++ .../scan/decoupled_lookback_scan.hpp | 364 ++++++++++++++++++ .../stdpar/detail/offload_heuristic_db.hpp | 2 + .../hipSYCL/std/stdpar/pstl-impl/numeric.hpp | 347 +++++++++++++++++ tests/CMakeLists.txt | 1 + tests/pstl/inclusive_scan.cpp | 201 ++++++++++ 7 files changed, 1008 insertions(+), 1 deletion(-) create mode 100644 include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp create mode 100644 tests/pstl/inclusive_scan.cpp diff --git a/doc/stdpar.md b/doc/stdpar.md index 4ff93e9fd..060a1c280 100644 --- a/doc/stdpar.md +++ b/doc/stdpar.md @@ -43,7 +43,9 @@ Offloading is implemented for the following STL algorithms: |`all_of` | | |`none_of` | | |`merge` | | -|`sort` | | +|`sort` | may not scale optimally for large problems | +|`inclusive_scan` | | +|`exclusive_scan` | | For all other execution policies or algorithms, the algorithm will compile and execute correctly, however the regular host implementation of the algorithm provided by the C++ standard library implementation will be invoked and no offloading takes place. 
diff --git a/include/hipSYCL/algorithms/numeric.hpp b/include/hipSYCL/algorithms/numeric.hpp index bf3bbbf3d..a2fb96611 100644 --- a/include/hipSYCL/algorithms/numeric.hpp +++ b/include/hipSYCL/algorithms/numeric.hpp @@ -23,8 +23,10 @@ #include "hipSYCL/sycl/queue.hpp" #include "hipSYCL/algorithms/reduction/reduction_descriptor.hpp" #include "hipSYCL/algorithms/reduction/reduction_engine.hpp" +#include "hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp" #include "hipSYCL/algorithms/util/memory_streaming.hpp" + namespace hipsycl::algorithms { namespace detail { @@ -272,6 +274,94 @@ sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, typename std::iterator_traits::value_type{}); } +///////////////////////////// scans ///////////////////////////////////// + +namespace detail { + +template +sycl::event scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + OptionalInitT init, + const std::vector &deps = {}) { + + auto generator = [=](auto idx, auto effective_group_id, auto effective_global_id, + auto problem_size) { + if(effective_global_id >= problem_size) + effective_global_id = problem_size - 1; + + InputIt it = first; + std::advance(it, effective_global_id); + return *it; + }; + auto result_processor = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size, + auto value) { + if (effective_global_id < problem_size) { + OutputIt it = d_first; + std::advance(it, effective_global_id); + *it = value; + } + }; + + std::size_t problem_size = std::distance(first, last); + std::size_t group_size = 128; + + using T = std::decay_t; + return scanning::decoupled_lookback_scan( + q, scratch_allocations, generator, result_processor, op, problem_size, + group_size, init, deps); +} + +} // detail + +template +sycl::event +inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, 
BinaryOp op, + const std::vector &deps = {}) { + + return detail::scan(q, scratch_allocations, first, last, d_first, op, + std::nullopt, deps); } +template +sycl::event +inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + T init, const std::vector &deps = {}) { + return detail::scan(q, scratch_allocations, first, last, d_first, op, + init, deps); +} + +template +sycl::event inclusive_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + const std::vector &deps = {}) { + return inclusive_scan(q, scratch_allocations, first, last, d_first, + std::plus<>{}, deps); +} + +template +sycl::event +exclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, T init, + BinaryOp op, const std::vector &deps = {}) { + return detail::scan(q, scratch_allocations, first, last, d_first, op, + init, deps); +} + +template +sycl::event exclusive_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + T init, const std::vector &deps = {}) { + return exclusive_scan(q, scratch_allocations, first, last, d_first, init, + std::plus<>{}, deps); +} + +} // algorithms + + #endif diff --git a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp new file mode 100644 index 000000000..4ae9a74a8 --- /dev/null +++ b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp @@ -0,0 +1,364 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_DECOUPLED_LOOKBACK_SCAN_HPP +#define ACPP_ALGORITHMS_DECOUPLED_LOOKBACK_SCAN_HPP + +#include +#include +#include +#include +#include +#include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/sycl/libkernel/atomic_ref.hpp" +#include "hipSYCL/sycl/libkernel/group_functions.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +namespace hipsycl::algorithms::scanning { + +namespace detail { + +enum class status : uint32_t { + invalid = 0, + aggregate_available = 1, + prefix_available = 2 +}; + +template +struct scratch_data { + scratch_data(util::allocation_group &scratch, std::size_t num_groups) { + group_aggregate = scratch.obtain(num_groups); + inclusive_prefix = scratch.obtain(num_groups); + group_status = scratch.obtain(num_groups); + } + + T* group_aggregate; + T* inclusive_prefix; + status* group_status; +}; + +template +constexpr bool can_use_group_algorithms() { + // TODO + return false; +} + +template +T collective_inclusive_group_scan(sycl::nd_item<1> idx, T my_element, + BinaryOp op, T *local_mem) { + if constexpr(can_use_group_algorithms()) { + // TODO + } else { + int lid = idx.get_local_linear_id(); + local_mem[lid] = my_element; + sycl::group_barrier(idx.get_group()); + + // TODO Improve this + if(lid == 0) { + T current = local_mem[0]; + for(int i = 1; i < idx.get_local_range().size(); ++i) { + current = op(current, local_mem[i]); + local_mem[i] = current; + } + } + sycl::group_barrier(idx.get_group()); + return local_mem[lid]; + } +} + +template +T collective_broadcast(sycl::nd_item<1> idx, T x, int local_id, T* local_mem) { + if constexpr(can_use_group_algorithms()) { + // TODO + } else { + if(idx.get_local_linear_id() == local_id) { + *local_mem = x; + } + sycl::group_barrier(idx.get_group()); + return *local_mem; + } +} + +template +T exclusive_prefix_look_back(const T &dummy_init, int effective_group_id, + detail::status *status, T *group_aggregate, + T 
*inclusive_prefix, BinaryOp op) { + // dummy_init is a dummy value here; avoid relying on default constructor + // in case T has none. + T exclusive_prefix = dummy_init; + bool exclusive_prefix_initialized = false; + + auto update_exclusive_prefix = [&](auto x){ + if(!exclusive_prefix_initialized) { + exclusive_prefix = x; + exclusive_prefix_initialized = true; + } else { + exclusive_prefix = op(x, exclusive_prefix); + } + }; + + for(int lookback_group = effective_group_id - 1; lookback_group > 0; --lookback_group) { + uint32_t& status_ptr = reinterpret_cast(status[lookback_group]); + sycl::atomic_ref status_ref{status_ptr}; + + detail::status lookback_status; + while ((lookback_status = static_cast(status_ref.load())) == + detail::status::invalid) + ; + + if(lookback_status == detail::status::prefix_available) { + update_exclusive_prefix(inclusive_prefix[lookback_group]); + return exclusive_prefix; + } else { + update_exclusive_prefix(group_aggregate[lookback_group]); + } + } + + return exclusive_prefix; +} + +template +T load_data_element(Generator &&gen, sycl::nd_item<1> idx, BinaryOp op, + uint32_t effective_group_id, std::size_t global_id, + std::size_t problem_size, OptionalInitT init) { + if constexpr (IsInclusive) { + auto elem = gen(idx, effective_group_id, global_id, problem_size); + if constexpr(!std::is_same_v) { + if(global_id == 0) { + return op(init, elem); + } + } + return elem; + } else { + if(global_id == 0) + return init; + return gen(idx, effective_group_id, global_id - 1, problem_size); + } +} + +template +void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, + uint32_t *group_counter, BinaryOp op, OptionalInitT init, + std::size_t problem_size, Generator &&gen, + Processor &&processor) { + sycl::atomic_ref + group_id_counter{*group_counter}; + + const int local_id = idx.get_local_linear_id(); + uint32_t effective_group_id = idx.get_group_linear_id(); + if(local_id == 0) { + effective_group_id = 
group_id_counter.fetch_add(static_cast(1)); + } + effective_group_id = collective_broadcast( + idx, effective_group_id, 0, reinterpret_cast(local_memory)); + + const std::size_t global_id = effective_group_id * idx.get_local_range().size() + + local_id; + + int local_size = idx.get_local_range().size(); + + std::size_t num_groups = idx.get_group_range().size(); + const bool is_last_group = effective_group_id == num_groups - 1; + if(is_last_group) { + std::size_t group_offset = effective_group_id * (num_groups - 1) + local_size; + local_size = problem_size - group_offset; + } + + // This invokes gen for the current work item to obtain our data element + // for the scan. If we are dealing with an exclusive scan, load_data_element + // shifts the data access by 1, thus allowing us to treat the scan as inclusive + // in the subsequent algorithm. + // It also applies init to the first data element, if provided. + T my_element = load_data_element( + gen, idx, op, effective_group_id, global_id, problem_size, init); + + // The exclusive scan case is handled in load_element() by accessing the element + // at global_id-1 instead of global_id. + T local_scan_result = + collective_inclusive_group_scan(idx, my_element, op, local_memory); + + if(local_id == local_size - 1) { + T group_aggregate = + IsInclusive ? 
local_scan_result : op(local_scan_result, my_element); + + uint32_t *status_ptr = + reinterpret_cast(&scratch.group_status[effective_group_id]); + sycl::atomic_ref status_ref{*status_ptr}; + + if(effective_group_id == 0) { + scratch.group_aggregate[effective_group_id] = group_aggregate; + scratch.group_aggregate[effective_group_id] = group_aggregate; + status_ref.store(static_cast(status::prefix_available)); + } else { + scratch.group_aggregate[effective_group_id] = group_aggregate; + status_ref.store(static_cast(status::aggregate_available)); + } + } + + if(effective_group_id != 0) { + // my_element is a dummy value here; avoid relying on default constructor + // in case T has none + T exclusive_prefix = my_element; + if(local_id == 0) { + exclusive_prefix = exclusive_prefix_look_back(my_element, effective_group_id, + scratch.group_status, scratch.group_aggregate, + scratch.inclusive_prefix, op); + } + exclusive_prefix = collective_broadcast( + idx, exclusive_prefix, local_id, local_memory); + local_scan_result = op(exclusive_prefix, local_scan_result); + } + + processor(idx, effective_group_id, global_id, problem_size, local_scan_result); +} + +} // detail + + +/// Implements the decoupled lookback scan algorithm - +/// See Merill, Garland (2016) for details. +/// +/// This algorithm assumes that the hardware can support acquire/release +/// atomics. +/// It also assumes that work groups with smaller ids are either scheduled +/// before work groups with higher ids, or that work group execution may be +/// preempted. To provide this guarantee universally, our implementation +/// reassigns work group ids based on when they start executing. +/// +/// \param gen A callable with signature \c T(nd_item<1>, uint32_t +/// effective_group_id, size_t effective_global_id, size_t problem_size) +/// +/// \c gen is the generator that generates the data elements to run the scan. 
+/// Note that the scan implementation may reorder work-groups; \c gen should +/// therefore not rely on the group id and global id from the provided nd_item, +/// but instead use the provided \c effective_group_id and and \c +/// effective_global_id. +/// +/// If the problem size is not divisible by the selected work group size, then +/// the last group might invoke \c gen with ids outside the bound. It is the +/// responsibility of \c gen to handle this case. For these work items, the +/// return value from \c gen can be an arbitrary dummy value (e.g. the last +/// valid element within bounds). +/// +/// \param processor A callable with signature \c void(nd_item<1>, uint32_t +/// effective_group_id, size_t effective_global_id, size_t problem_size, T +/// result) +/// +/// \c processor is invoked at the end of the scan with the result of the global +/// scan for this particular work item. \c processor will be invoked once the +/// global result for the work item is available which might be before the scan +/// has completed for all work items. Do not assume global synchronization. +/// +/// Note that the scan implementation may reorder work-groups; \c processor +/// should therefore not rely on the group id and global id from the provided +/// nd_item, but instead use the provided \c effective_group_id and and \c +/// effective_global_id. +/// +/// If the problem size is not divisible by the selected work group size, then +/// the last group might invoke \c processor with ids outside the bound. It is +/// the responsibility of \c processor to handle this case. For these work +/// items, the result value passed into \c processor is undefined. 
+template +sycl::event +decoupled_lookback_scan(sycl::queue &q, util::allocation_group &scratch_alloc, + WorkItemDataGenerator gen, + ResultProcessor processor, BinaryOp op, + std::size_t problem_size, std::size_t group_size, + OptionalInitT init = std::nullopt, + const std::vector &user_deps = {}) { + + if(problem_size == 0) + return sycl::event{}; + + static_assert(IsInclusive || std::is_convertible_v, + "Non-inclusive scans need an init argument of same type as the " + "scan data element"); + static_assert( + std::is_convertible_v || + std::is_same_v, + "Init argument must be of std::nullopt_t type or exact type of scan " + "data elements"); + + std::size_t num_groups = (problem_size + group_size - 1) / group_size; + detail::scratch_data scratch{scratch_alloc, num_groups}; + uint32_t* group_counter = scratch_alloc.obtain(1); + + auto initialization_evt = q.parallel_for(num_groups, [=](sycl::id<1> idx){ + scratch.group_status[idx] = detail::status::invalid; + if(idx.get(0) == 0) { + *group_counter = 0; + } + }); + + std::vector deps = user_deps; + if(!q.is_in_order()) + deps.push_back(initialization_evt); + + sycl::nd_range<1> kernel_range{num_groups * group_size, group_size}; + if constexpr(detail::can_use_group_algorithms()) { + return q.parallel_for(kernel_range, deps, [=](auto idx) { + detail::scan_kernel(idx, nullptr, scratch, group_counter, op, + init, problem_size, gen, processor); + }); + } else { + // We need local memory: + // - 1 data element per work item + // - at least size for one uint32_t to broadcast group id + std::size_t local_mem_elements = + std::max(group_size, (sizeof(uint32_t) + sizeof(T) - 1) / sizeof(T)); + + // This is not entirely correct since max local mem size can also depend + // on work group size. + // We also assume that there is no other local memory consumer. 
+ // TODO Improve this + std::size_t max_local_size = + q.get_device().get_info(); + + bool has_sufficient_local_memory = static_cast(max_local_size) >= + 1.5 * sizeof(T) * local_mem_elements; + + if(has_sufficient_local_memory) { + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + sycl::local_accessor local_mem{local_mem_elements, cgh}; + cgh.parallel_for(kernel_range, [=](auto idx) { + detail::scan_kernel(idx, &(local_mem[0]), + scratch, group_counter, op, init, + problem_size, gen, processor); + }); + }); + } else { + // This is a super inefficient dummy algorithm for now that requires + // large scratch storage + T* emulated_local_mem = scratch_alloc.obtain(num_groups * local_mem_elements); + + return q.parallel_for(kernel_range, deps, [=](auto idx) { + detail::scan_kernel( + idx, + emulated_local_mem + local_mem_elements * idx.get_group_linear_id(), + scratch, group_counter, op, init, problem_size, gen, processor); + }); + } + } +} +} + +#endif diff --git a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp index 2b8331ce4..ed17da13b 100644 --- a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp +++ b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp @@ -52,6 +52,8 @@ struct any_of {}; struct none_of {}; struct sort {}; struct merge {}; +struct inclusive_scan {}; +struct exclusive_scan {}; struct transform_reduce {}; diff --git a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp index f8b3776dd..e10a7f97b 100644 --- a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp @@ -291,6 +291,181 @@ T reduce(hipsycl::stdpar::par_unseq, ForwardIt first, } + +// scans + + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op) { + + auto offloader = [&](auto& queue){ + 
OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op); + } + return last; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op); + +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, + OutputIt d_first, + BinaryOp op, T init) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op, init); + } + return last; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op, init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, + OutputIt d_first) { + 
+ auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first); + } + return last; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt +exclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, OutputIt d_first, T init, + BinaryOp op) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init, op); + } + return last; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, init, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt exclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt 
last, OutputIt d_first, + T init) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init); + } + return last; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init); +} + + //////////////////// par policy ///////////////////////////////////// @@ -559,6 +734,178 @@ T reduce(hipsycl::stdpar::par, ForwardIt first, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), init, binary_op); } +// scans + + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt inclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op); + } + return last; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, 
last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op); + +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, + OutputIt d_first, + BinaryOp op, T init) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op, init); + } + return last; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op, init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, + OutputIt d_first) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first); + } + return last; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_host_fallback, first, last, + d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, 
last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt +exclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, OutputIt d_first, T init, + BinaryOp op) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init, op); + } + return last; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, init, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt exclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, OutputIt d_first, + T init) { + + auto offloader = [&](auto& queue){ + OutputIt last = d_first; + auto problem_size = std::distance(first, last); + std::advance(last, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init); + } + return last; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par{}), + 
std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init); +} } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b813c47c0..ac513d090 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -140,6 +140,7 @@ if(WITH_PSTL_TESTS) pstl/for_each_n.cpp pstl/generate.cpp pstl/generate_n.cpp + pstl/inclusive_scan.cpp pstl/memory.cpp pstl/merge.cpp pstl/none_of.cpp diff --git a/tests/pstl/inclusive_scan.cpp b/tests/pstl/inclusive_scan.cpp new file mode 100644 index 000000000..4672859aa --- /dev/null +++ b/tests/pstl/inclusive_scan.cpp @@ -0,0 +1,201 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_inclusive_scan, enable_unified_shared_memory) + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; + std::array + padding; +}; + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template +bool operator==(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() == b.get(); +} + +template +bool operator!=(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() != b.get(); +} + +template +void test_scan(Policy&& pol, Generator&& 
gen, T init, BinOp op, std::size_t size) { + std::vector data; + for(std::size_t i = 0; i < data.size(); ++i) + data.push_back(gen(i)); + + std::vector reference0; + if constexpr(std::is_same_v>) { + reference0 = data; + std::inclusive_scan(data.begin(), data.end(), reference0.begin()); + } + + std::vector reference1 = data; + std::inclusive_scan(data.begin(), data.end(), reference1.begin(), op); + + std::vector reference2 = data; + std::inclusive_scan(data.begin(), data.end(), reference1.begin(), op, init); + + std::vector device_result0 = data; + if constexpr(std::is_same_v>) { + BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), + device_result0.begin()) == + device_result0.end()); + } + + std::vector device_result1 = data; + BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), + device_result1.begin(), op) == + device_result1.end()); + + std::vector device_result2 = data; + BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), + device_result2.begin(), op, init) == + device_result2.end()); + + if constexpr(std::is_same_v>) { + BOOST_CHECK(reference0 == device_result0); + } + BOOST_CHECK(reference1 == device_result1); + BOOST_CHECK(reference2 == device_result2); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast(i); + }; +} + +template +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); + }; +} + +template +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + 
non_constructible_t::make(3ull), + get_non_constructible_bin_op(), ProblemSize); + + using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize); +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_large) { + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + +BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() From 732575fd208179b7f2c8640d03660b924709eca1 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 25 Nov 2024 06:26:18 +0100 Subject: [PATCH 064/126] [stdpar] Add exclusive_scan tests --- tests/CMakeLists.txt | 1 + tests/pstl/exclusive_scan.cpp | 193 ++++++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 tests/pstl/exclusive_scan.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ac513d090..fd4ac1393 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -134,6 +134,7 @@ if(WITH_PSTL_TESTS) pstl/copy.cpp pstl/copy_if.cpp pstl/copy_n.cpp + pstl/exclusive_scan.cpp 
pstl/fill.cpp pstl/fill_n.cpp pstl/for_each.cpp diff --git a/tests/pstl/exclusive_scan.cpp b/tests/pstl/exclusive_scan.cpp new file mode 100644 index 000000000..f6df7f5c8 --- /dev/null +++ b/tests/pstl/exclusive_scan.cpp @@ -0,0 +1,193 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_exclusive_scan, enable_unified_shared_memory) + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; + std::array + padding; +}; + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template +bool operator==(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() == b.get(); +} + +template +bool operator!=(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() != b.get(); +} + +template +void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { + std::vector data; + for(std::size_t i = 0; i < data.size(); ++i) + data.push_back(gen(i)); + + std::vector reference0; + if constexpr(std::is_same_v>) { + reference0 = data; + std::exclusive_scan(data.begin(), data.end(), reference0.begin(), init); + } + + std::vector reference1 = data; + std::exclusive_scan(data.begin(), data.end(), 
reference1.begin(), init, op); + + std::vector device_result0 = data; + if constexpr(std::is_same_v>) { + BOOST_CHECK(std::exclusive_scan(pol, data.begin(), data.end(), + device_result0.begin(), init) == + device_result0.end()); + } + + std::vector device_result1 = data; + BOOST_CHECK(std::exclusive_scan(pol, data.begin(), data.end(), + device_result1.begin(), init, op) == + device_result1.end()); + + + if constexpr(std::is_same_v>) { + BOOST_CHECK(reference0 == device_result0); + } + BOOST_CHECK(reference1 == device_result1); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast(i); + }; +} + +template +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); + }; +} + +template +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + non_constructible_t::make(3ull), + get_non_constructible_bin_op(), ProblemSize); + + using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize); +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + 
run_all_tests<1000>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_large) { + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + +BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() From cdf7c03beb7f7e85f76454f2f51a034134a72d30 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 25 Nov 2024 07:50:33 +0100 Subject: [PATCH 065/126] Add needed atomic mappings for ptx --- .../generic/hiplike/atomic_builtins.hpp | 43 +++++++++++++++++++ src/libkernel/sscp/ptx/atomic.cpp | 17 ++++++++ 2 files changed, 60 insertions(+) diff --git a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp index 8becda6f5..fb72a1a89 100644 --- a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp +++ b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp @@ -17,6 +17,7 @@ #include + #if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA || \ ACPP_LIBKERNEL_IS_DEVICE_PASS_HIP @@ -40,10 +41,44 @@ inline constexpr int builtin_memory_order(memory_order o) noexcept { return __ATOMIC_RELAXED; } +#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA + +// LLVM NVPTX backend does currently not properly support acquire/release +// atomics. We workaround this for two load/store instructions that we +// need for the algorithms library using inline assembly. 
+__attribute__((always_inline)) +void __acpp_cuda_atomic_store_device_rel_i32(int32_t *ptr, int32_t x) { + asm volatile("st.release.gpu.s32 [%0], %1;" + : + :"l"(ptr), "r"(x) + : "memory"); +} + +__attribute__((always_inline)) +int32_t __acpp_cuda_atomic_load_device_acq_i32(int32_t *ptr) { + int32_t result; + asm volatile("ld.acquire.gpu.u32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; +} + +#endif + template HIPSYCL_HIPLIKE_BUILTIN void __acpp_atomic_store(T *addr, T x, memory_order order, memory_scope scope) noexcept { + if constexpr(sizeof(T) == sizeof(int32_t)) { +#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA + if(scope == memory_scope::device && order == memory_order::release){ + __acpp_cuda_atomic_store_device_rel_i32(reinterpret_cast(addr), + bit_cast(x)); + return; + } +#endif + } __atomic_store_n(addr, x, builtin_memory_order(order)); } @@ -66,6 +101,14 @@ __acpp_atomic_store(double *addr, double x, memory_order order, template HIPSYCL_HIPLIKE_BUILTIN T __acpp_atomic_load(T *addr, memory_order order, memory_scope scope) noexcept { + if constexpr(sizeof(T) == sizeof(int32_t)) { +#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA + if(scope == memory_scope::device && order == memory_order::acquire){ + return bit_cast(__acpp_cuda_atomic_load_device_acq_i32( + reinterpret_cast(addr))); + } +#endif + } return __atomic_load_n(addr, builtin_memory_order(order)); } diff --git a/src/libkernel/sscp/ptx/atomic.cpp b/src/libkernel/sscp/ptx/atomic.cpp index f8d1f627f..afd4830ac 100644 --- a/src/libkernel/sscp/ptx/atomic.cpp +++ b/src/libkernel/sscp/ptx/atomic.cpp @@ -463,6 +463,14 @@ HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i32( : "memory"); return; } + } else if(scope == __acpp_sscp_memory_scope::device) { + if(order == __acpp_sscp_memory_order::release) { + asm volatile("st.release.gpu.s32 [%0], %1;" + : + :"l"(ptr), "r"(x) + : "memory"); + return; + } } *ptr = x; @@ -503,6 +511,15 @@ HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_load_i32( : 
"memory"); return result; } + } else if(scope == __acpp_sscp_memory_scope::device) { + if(order == __acpp_sscp_memory_order::acquire) { + __acpp_int32 result; + asm volatile("ld.acquire.gpu.u32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; + } } return *ptr; From 24273e87d47cf79c59691fdf7a4af15eec44a2c1 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 25 Nov 2024 08:17:01 +0100 Subject: [PATCH 066/126] Improve fallback scan algorithm --- .../scan/decoupled_lookback_scan.hpp | 56 ++++++++++++++----- 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp index 4ae9a74a8..c148ca22d 100644 --- a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp +++ b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp @@ -45,6 +45,47 @@ struct scratch_data { status* group_status; }; +template +T kogge_stone_scan(sycl::nd_item<1> idx, T my_element, BinaryOp op, + T *local_mem) { + const int lid = idx.get_local_linear_id(); + const int local_size = idx.get_local_range().size(); + local_mem[lid] = my_element; + + for (unsigned stride = 1; stride < local_size; stride <<= 1) { + sycl::group_barrier(idx.get_group()); + T current = my_element; + if (lid >= stride) { + current = op(local_mem[lid - stride], local_mem[lid]); + } + sycl::group_barrier(idx.get_group()); + + if (lid >= stride) { + local_mem[lid] = current; + } + } + + return local_mem[lid]; +} + +template +T sequential_scan(sycl::nd_item<1> idx, T my_element, BinaryOp op, + T *local_mem) { + int lid = idx.get_local_linear_id(); + local_mem[lid] = my_element; + sycl::group_barrier(idx.get_group()); + + if(lid == 0) { + T current = local_mem[0]; + for(int i = 1; i < idx.get_local_range().size(); ++i) { + current = op(current, local_mem[i]); + local_mem[i] = current; + } + } + sycl::group_barrier(idx.get_group()); + return local_mem[lid]; +} + template 
constexpr bool can_use_group_algorithms() { // TODO @@ -57,20 +98,7 @@ T collective_inclusive_group_scan(sycl::nd_item<1> idx, T my_element, if constexpr(can_use_group_algorithms()) { // TODO } else { - int lid = idx.get_local_linear_id(); - local_mem[lid] = my_element; - sycl::group_barrier(idx.get_group()); - - // TODO Improve this - if(lid == 0) { - T current = local_mem[0]; - for(int i = 1; i < idx.get_local_range().size(); ++i) { - current = op(current, local_mem[i]); - local_mem[i] = current; - } - } - sycl::group_barrier(idx.get_group()); - return local_mem[lid]; + return kogge_stone_scan(idx, my_element, op, local_mem); } } From 9d298ac16eee1b297e6aa73afbe89cdc71d18400 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 25 Nov 2024 17:00:11 +0100 Subject: [PATCH 067/126] Add missing attributes --- .../sycl/libkernel/generic/hiplike/atomic_builtins.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp index fb72a1a89..a203fccce 100644 --- a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp +++ b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp @@ -47,6 +47,7 @@ inline constexpr int builtin_memory_order(memory_order o) noexcept { // atomics. We workaround this for two load/store instructions that we // need for the algorithms library using inline assembly. 
__attribute__((always_inline)) +HIPSYCL_HIPLIKE_BUILTIN void __acpp_cuda_atomic_store_device_rel_i32(int32_t *ptr, int32_t x) { asm volatile("st.release.gpu.s32 [%0], %1;" : @@ -54,7 +55,8 @@ void __acpp_cuda_atomic_store_device_rel_i32(int32_t *ptr, int32_t x) { : "memory"); } -__attribute__((always_inline)) +__attribute__((always_inline)) +HIPSYCL_HIPLIKE_BUILTIN int32_t __acpp_cuda_atomic_load_device_acq_i32(int32_t *ptr) { int32_t result; asm volatile("ld.acquire.gpu.u32 %0,[%1];" From 95cd1e59ce1b14b8e574a54e60cd0659a717d81b Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 25 Nov 2024 17:35:43 +0100 Subject: [PATCH 068/126] Add acquire/release emulation on older CUDA hardware that does not support them in PTX --- .../sycl/libkernel/generic/hiplike/atomic_builtins.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp index a203fccce..fd2572e87 100644 --- a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp +++ b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp @@ -49,21 +49,31 @@ inline constexpr int builtin_memory_order(memory_order o) noexcept { __attribute__((always_inline)) HIPSYCL_HIPLIKE_BUILTIN void __acpp_cuda_atomic_store_device_rel_i32(int32_t *ptr, int32_t x) { +#if __CUDA_ARCH__ < 700 + *ptr = x; + __threadfence(); +#else asm volatile("st.release.gpu.s32 [%0], %1;" : :"l"(ptr), "r"(x) : "memory"); +#endif } __attribute__((always_inline)) HIPSYCL_HIPLIKE_BUILTIN int32_t __acpp_cuda_atomic_load_device_acq_i32(int32_t *ptr) { +#if __CUDA_ARCH__ < 700 + __threadfence(); + return *ptr; +#else int32_t result; asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(result) : "l"(ptr) : "memory"); return result; +#endif } #endif From 0a63fc2ea56200ca9f6be8b5c4a2de78a7a997f5 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 25 Nov 2024 18:12:41 +0100 Subject: [PATCH 069/126] 
CI: Bump nvc++ target compute capability to 70 --- .github/workflows/linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index edd18a837..38a87d8bf 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -160,5 +160,5 @@ jobs: - name: build tests run: | mkdir ${GITHUB_WORKSPACE}/build/tests-nvcxx - cmake -DACPP_TARGETS="cuda-nvcxx" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="-DHIPSYCL_NO_FIBERS" -DAdaptiveCpp_DIR=${GITHUB_WORKSPACE}/build/install/lib/cmake/AdaptiveCpp ${GITHUB_WORKSPACE}/tests + cmake -DACPP_TARGETS="cuda-nvcxx:cc70" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="-DHIPSYCL_NO_FIBERS" -DAdaptiveCpp_DIR=${GITHUB_WORKSPACE}/build/install/lib/cmake/AdaptiveCpp ${GITHUB_WORKSPACE}/tests make -j2 From 895ffba5e5b23f1e01cbdaa1d13993a2c4eb8185 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 26 Nov 2024 09:26:02 +0100 Subject: [PATCH 070/126] Fix bugs --- .../scan/decoupled_lookback_scan.hpp | 44 ++++++++++---- .../hipSYCL/std/stdpar/pstl-impl/numeric.hpp | 60 +++++++++---------- tests/pstl/exclusive_scan.cpp | 2 +- tests/pstl/inclusive_scan.cpp | 6 +- 4 files changed, 65 insertions(+), 47 deletions(-) diff --git a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp index c148ca22d..734c05bfc 100644 --- a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp +++ b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp @@ -56,7 +56,7 @@ T kogge_stone_scan(sycl::nd_item<1> idx, T my_element, BinaryOp op, sycl::group_barrier(idx.get_group()); T current = my_element; if (lid >= stride) { - current = op(local_mem[lid - stride], local_mem[lid]); + current = op(local_mem[lid - stride], local_mem[lid]); } sycl::group_barrier(idx.get_group()); @@ -65,7 +65,9 @@ T kogge_stone_scan(sycl::nd_item<1> idx, T my_element, BinaryOp op, } } - return local_mem[lid]; + auto result = 
local_mem[lid]; + sycl::group_barrier(idx.get_group()); + return result; } template @@ -83,7 +85,9 @@ T sequential_scan(sycl::nd_item<1> idx, T my_element, BinaryOp op, } } sycl::group_barrier(idx.get_group()); - return local_mem[lid]; + auto result = local_mem[lid]; + sycl::group_barrier(idx.get_group()); + return result; } template @@ -99,6 +103,7 @@ T collective_inclusive_group_scan(sycl::nd_item<1> idx, T my_element, // TODO } else { return kogge_stone_scan(idx, my_element, op, local_mem); + //return sequential_scan(idx, my_element, op, local_mem); } } @@ -111,7 +116,9 @@ T collective_broadcast(sycl::nd_item<1> idx, T x, int local_id, T* local_mem) { *local_mem = x; } sycl::group_barrier(idx.get_group()); - return *local_mem; + auto result = *local_mem; + sycl::group_barrier(idx.get_group()); + return result; } } @@ -133,7 +140,7 @@ T exclusive_prefix_look_back(const T &dummy_init, int effective_group_id, } }; - for(int lookback_group = effective_group_id - 1; lookback_group > 0; --lookback_group) { + for(int lookback_group = effective_group_id - 1; lookback_group >= 0; --lookback_group) { uint32_t& status_ptr = reinterpret_cast(status[lookback_group]); sycl::atomic_ref idx, T *local_memory, scratch_data scratch, T local_scan_result = collective_inclusive_group_scan(idx, my_element, op, local_memory); - if(local_id == local_size - 1) { - T group_aggregate = - IsInclusive ? local_scan_result : op(local_scan_result, my_element); - - uint32_t *status_ptr = + uint32_t *status_ptr = reinterpret_cast(&scratch.group_status[effective_group_id]); sycl::atomic_ref status_ref{*status_ptr}; + + // Set group aggregate which we now know after scan. The first group + // Can also set its prefix and is done. + if(local_id == local_size - 1) { + T group_aggregate = + IsInclusive ? 
local_scan_result : op(local_scan_result, my_element); if(effective_group_id == 0) { scratch.group_aggregate[effective_group_id] = group_aggregate; - scratch.group_aggregate[effective_group_id] = group_aggregate; + scratch.inclusive_prefix[effective_group_id] = group_aggregate; status_ref.store(static_cast(status::prefix_available)); } else { scratch.group_aggregate[effective_group_id] = group_aggregate; @@ -239,6 +248,7 @@ void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, } } + // All groups except group 0 need to perform lookback to find their prefix if(effective_group_id != 0) { // my_element is a dummy value here; avoid relying on default constructor // in case T has none @@ -249,8 +259,16 @@ void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, scratch.inclusive_prefix, op); } exclusive_prefix = collective_broadcast( - idx, exclusive_prefix, local_id, local_memory); + idx, exclusive_prefix, 0, local_memory); local_scan_result = op(exclusive_prefix, local_scan_result); + + // All groups except first and last one need to update their prefix + if(effective_group_id != num_groups - 1) { + if(local_id == local_size - 1){ + scratch.inclusive_prefix[effective_group_id] = local_scan_result; + status_ref.store(static_cast(status::prefix_available)); + } + } } processor(idx, effective_group_id, global_id, problem_size, local_scan_result); @@ -326,7 +344,7 @@ decoupled_lookback_scan(sycl::queue &q, util::allocation_group &scratch_alloc, std::size_t num_groups = (problem_size + group_size - 1) / group_size; detail::scratch_data scratch{scratch_alloc, num_groups}; - uint32_t* group_counter = scratch_alloc.obtain(1); + uint32_t* group_counter = scratch_alloc.obtain(1); auto initialization_evt = q.parallel_for(num_groups, [=](sycl::id<1> idx){ scratch.group_status[idx] = detail::status::invalid; diff --git a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp index 
e10a7f97b..a65fdbe12 100644 --- a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp @@ -301,9 +301,9 @@ OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, InputIt first, InputIt last, OutputIt d_first, BinaryOp op) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -312,7 +312,7 @@ OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, d_first, op); } - return last; + return result; }; auto fallback = [&]() { @@ -336,9 +336,9 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, BinaryOp op, T init) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -347,7 +347,7 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, d_first, op, init); } - return last; + return result; }; auto fallback = [&]() { @@ -369,9 +369,9 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, OutputIt d_first) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -380,7 +380,7 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt 
inclusive_scan(hipsycl::stdpar::par_unseq, hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, d_first); } - return last; + return result; }; auto fallback = [&]() { @@ -404,9 +404,9 @@ exclusive_scan(hipsycl::stdpar::par_unseq, BinaryOp op) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -415,7 +415,7 @@ exclusive_scan(hipsycl::stdpar::par_unseq, hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, d_first, init, op); } - return last; + return result; }; auto fallback = [&]() { @@ -438,9 +438,9 @@ OutputIt exclusive_scan(hipsycl::stdpar::par_unseq, T init) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -449,7 +449,7 @@ OutputIt exclusive_scan(hipsycl::stdpar::par_unseq, hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, d_first, init); } - return last; + return result; }; auto fallback = [&]() { @@ -743,9 +743,9 @@ OutputIt inclusive_scan(hipsycl::stdpar::par, InputIt first, InputIt last, OutputIt d_first, BinaryOp op) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -754,7 +754,7 @@ OutputIt inclusive_scan(hipsycl::stdpar::par, hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, d_first, op); } - return last; 
+ return result; }; auto fallback = [&]() { @@ -778,9 +778,9 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, BinaryOp op, T init) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -789,7 +789,7 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, d_first, op, init); } - return last; + return result; }; auto fallback = [&]() { @@ -811,9 +811,9 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, OutputIt d_first) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -822,7 +822,7 @@ HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, d_first); } - return last; + return result; }; auto fallback = [&]() { @@ -846,9 +846,9 @@ exclusive_scan(hipsycl::stdpar::par, BinaryOp op) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -857,7 +857,7 @@ exclusive_scan(hipsycl::stdpar::par, hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, d_first, init, op); } - return last; + return result; }; auto fallback = [&]() { @@ -880,9 +880,9 @@ OutputIt 
exclusive_scan(hipsycl::stdpar::par, T init) { auto offloader = [&](auto& queue){ - OutputIt last = d_first; + OutputIt result = d_first; auto problem_size = std::distance(first, last); - std::advance(last, problem_size); + std::advance(result, problem_size); if(problem_size > 0) { auto scratch_group = hipsycl::stdpar::detail::stdpar_tls_runtime::get() @@ -891,7 +891,7 @@ OutputIt exclusive_scan(hipsycl::stdpar::par, hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, d_first, init); } - return last; + return result; }; auto fallback = [&]() { diff --git a/tests/pstl/exclusive_scan.cpp b/tests/pstl/exclusive_scan.cpp index f6df7f5c8..0784a247a 100644 --- a/tests/pstl/exclusive_scan.cpp +++ b/tests/pstl/exclusive_scan.cpp @@ -72,7 +72,7 @@ bool operator!=(const non_default_constructible &a, template void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { std::vector data; - for(std::size_t i = 0; i < data.size(); ++i) + for(std::size_t i = 0; i < size; ++i) data.push_back(gen(i)); std::vector reference0; diff --git a/tests/pstl/inclusive_scan.cpp b/tests/pstl/inclusive_scan.cpp index 4672859aa..3c8ba3134 100644 --- a/tests/pstl/inclusive_scan.cpp +++ b/tests/pstl/inclusive_scan.cpp @@ -72,7 +72,7 @@ bool operator!=(const non_default_constructible &a, template void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { std::vector data; - for(std::size_t i = 0; i < data.size(); ++i) + for(std::size_t i = 0; i < size; ++i) data.push_back(gen(i)); std::vector reference0; @@ -85,7 +85,7 @@ void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size std::inclusive_scan(data.begin(), data.end(), reference1.begin(), op); std::vector reference2 = data; - std::inclusive_scan(data.begin(), data.end(), reference1.begin(), op, init); + std::inclusive_scan(data.begin(), data.end(), reference2.begin(), op, init); std::vector device_result0 = data; if constexpr(std::is_same_v>) { @@ -98,7 
+98,7 @@ void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), device_result1.begin(), op) == device_result1.end()); - + std::vector device_result2 = data; BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), device_result2.begin(), op, init) == From b564c710bd6fe8e859d1eb9de4e70a130b5c74ae Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 26 Nov 2024 16:02:45 +0100 Subject: [PATCH 071/126] Fix incorrect group aggregate for exclusive_scan --- include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp index 734c05bfc..c38f48072 100644 --- a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp +++ b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp @@ -235,8 +235,7 @@ void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, // Set group aggregate which we now know after scan. The first group // Can also set its prefix and is done. if(local_id == local_size - 1) { - T group_aggregate = - IsInclusive ? 
local_scan_result : op(local_scan_result, my_element); + T group_aggregate = local_scan_result; if(effective_group_id == 0) { scratch.group_aggregate[effective_group_id] = group_aggregate; From 44ef36b6025da4716f92f026b9111cebd450c39a Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 26 Nov 2024 16:48:44 +0100 Subject: [PATCH 072/126] [SSCP][llvm-to-ptx] Don't use acquire release load/stores on pre-Volta hardware --- src/libkernel/sscp/ptx/atomic.cpp | 77 +++++++++++++++++-------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/src/libkernel/sscp/ptx/atomic.cpp b/src/libkernel/sscp/ptx/atomic.cpp index afd4830ac..45b9c2dce 100644 --- a/src/libkernel/sscp/ptx/atomic.cpp +++ b/src/libkernel/sscp/ptx/atomic.cpp @@ -13,6 +13,8 @@ #include "hipSYCL/sycl/libkernel/sscp/builtins/ptx/libdevice.hpp" +extern "C" int __acpp_sscp_jit_reflect_target_arch(); + // Atomic definitions adapted from __clang_cuda_device_functions.h double __dAtomicAdd(double *__p, double __v) { @@ -455,24 +457,25 @@ HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i16( HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int32 *ptr, __acpp_int32 x) { - if(scope == __acpp_sscp_memory_scope::system) { - if(order == __acpp_sscp_memory_order::release) { - asm volatile("st.release.sys.s32 [%0], %1;" - : - :"l"(ptr), "r"(x) - : "memory"); - return; - } - } else if(scope == __acpp_sscp_memory_scope::device) { - if(order == __acpp_sscp_memory_order::release) { - asm volatile("st.release.gpu.s32 [%0], %1;" - : - :"l"(ptr), "r"(x) - : "memory"); - return; + if(__acpp_sscp_jit_reflect_target_arch() >= 70) { + if(scope == __acpp_sscp_memory_scope::system) { + if(order == __acpp_sscp_memory_order::release) { + asm volatile("st.release.sys.s32 [%0], %1;" + : + :"l"(ptr), "r"(x) + : "memory"); + return; + } + } else if(scope == __acpp_sscp_memory_scope::device) { + if(order == 
__acpp_sscp_memory_order::release) { + asm volatile("st.release.gpu.s32 [%0], %1;" + : + :"l"(ptr), "r"(x) + : "memory"); + return; + } } } - *ptr = x; mem_fence(scope); } @@ -490,44 +493,50 @@ HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i64( HIPSYCL_SSCP_BUILTIN __acpp_int8 __acpp_sscp_atomic_load_i8( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int8 *ptr) { + mem_fence(scope); return *ptr; } HIPSYCL_SSCP_BUILTIN __acpp_int16 __acpp_sscp_atomic_load_i16( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int16 *ptr) { + mem_fence(scope); return *ptr; } HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_load_i32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int32 *ptr) { - if(scope == __acpp_sscp_memory_scope::system) { - if(order == __acpp_sscp_memory_order::acquire) { - __acpp_int32 result; - asm volatile("ld.acquire.sys.u32 %0,[%1];" - : "=r"(result) - : "l"(ptr) - : "memory"); - return result; + + if(__acpp_sscp_jit_reflect_target_arch() >= 70) { + if(scope == __acpp_sscp_memory_scope::system) { + if(order == __acpp_sscp_memory_order::acquire) { + __acpp_int32 result; + asm volatile("ld.acquire.sys.u32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; + } + } else if(scope == __acpp_sscp_memory_scope::device) { + if(order == __acpp_sscp_memory_order::acquire) { + __acpp_int32 result; + asm volatile("ld.acquire.gpu.u32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; + } } - } else if(scope == __acpp_sscp_memory_scope::device) { - if(order == __acpp_sscp_memory_order::acquire) { - __acpp_int32 result; - asm volatile("ld.acquire.gpu.u32 %0,[%1];" - : "=r"(result) - : "l"(ptr) - : "memory"); - return result; - } } - + mem_fence(scope); return *ptr; } HIPSYCL_SSCP_BUILTIN __acpp_int64 __acpp_sscp_atomic_load_i64( __acpp_sscp_address_space as, 
__acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int64 *ptr) { + mem_fence(scope); return *ptr; } From 54ba3f154a04ae38d788ca115451ea55070791d2 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 4 Dec 2024 06:07:53 +0100 Subject: [PATCH 073/126] [SSCP] Also expose reflection to builtin bitcode libraries --- doc/env_variables.md | 1 + src/compiler/llvm-to-backend/LLVMToBackend.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/doc/env_variables.md b/doc/env_variables.md index 618bc553e..792a372c9 100644 --- a/doc/env_variables.md +++ b/doc/env_variables.md @@ -54,6 +54,7 @@ Available stages for dumping: * `ACPP_S2_IR_DUMP_REFLECTION` - After processing JIT-time reflection queries * `ACPP_S2_IR_DUMP_JIT_OPTIMIZATIONS` - After processing optimizations that rely on JIT-time information` * `ACPP_S2_IR_DUMP_BACKEND_FLAVORING` - After applying the "backend flavor", i.e. turning generic LLVM IR into IR that targets a specific backend. +* `ACPP_S2_IR_DUMP_BUILTIN_REFLECTION` - After second run of JIT-time reflection pass; particularly affects reflection use inside AdaptiveCpp builtins. * `ACPP_S2_IR_DUMP_FULL_OPTIMIZATIONS` - After running the full LLVM optimization pipeline on the code. * `ACPP_S2_IR_DUMP_FINAL` - Final state of the LLVM IR before handing it off to lowering it to backend-specific formats (e.g. PTX, amdgcn ISA, SPIR-V). * `ACPP_S2_IR_DUMP_ALL` - Dump all stages. 
diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index 0a7f50635..3d1cd123f 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -415,6 +415,9 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { } enableModuleStateDumping(M, "backend_flavoring", getCompilationIdentifier()); + // Run again to resolve reflection inside builtins + S2RP.run(M, MAM); + enableModuleStateDumping(M, "builtin_reflection", getCompilationIdentifier()); // Inline again to handle builtin definitions pulled in by backend flavors InliningPass.run(M, MAM); From 451c64d09357c94111edda8d87551da84812f8a8 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 4 Dec 2024 06:23:09 +0100 Subject: [PATCH 074/126] Don't use CUDA atomic workaround on nvc++ --- .github/workflows/linux.yml | 2 +- .../libkernel/generic/hiplike/atomic_builtins.hpp | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 38a87d8bf..edd18a837 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -160,5 +160,5 @@ jobs: - name: build tests run: | mkdir ${GITHUB_WORKSPACE}/build/tests-nvcxx - cmake -DACPP_TARGETS="cuda-nvcxx:cc70" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="-DHIPSYCL_NO_FIBERS" -DAdaptiveCpp_DIR=${GITHUB_WORKSPACE}/build/install/lib/cmake/AdaptiveCpp ${GITHUB_WORKSPACE}/tests + cmake -DACPP_TARGETS="cuda-nvcxx" -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="-DHIPSYCL_NO_FIBERS" -DAdaptiveCpp_DIR=${GITHUB_WORKSPACE}/build/install/lib/cmake/AdaptiveCpp ${GITHUB_WORKSPACE}/tests make -j2 diff --git a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp index fd2572e87..d5e3a0d56 100644 --- a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp +++ 
b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp @@ -41,8 +41,11 @@ inline constexpr int builtin_memory_order(memory_order o) noexcept { return __ATOMIC_RELAXED; } -#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA +#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA && !defined(ACPP_LIBKERNEL_CUDA_NVCXX) + #define ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS +#endif +#ifdef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS // LLVM NVPTX backend does currently not properly support acquire/release // atomics. We workaround this for two load/store instructions that we // need for the algorithms library using inline assembly. @@ -83,7 +86,7 @@ HIPSYCL_HIPLIKE_BUILTIN void __acpp_atomic_store(T *addr, T x, memory_order order, memory_scope scope) noexcept { if constexpr(sizeof(T) == sizeof(int32_t)) { -#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA +#ifdef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS if(scope == memory_scope::device && order == memory_order::release){ __acpp_cuda_atomic_store_device_rel_i32(reinterpret_cast(addr), bit_cast(x)); @@ -114,7 +117,7 @@ template HIPSYCL_HIPLIKE_BUILTIN T __acpp_atomic_load(T *addr, memory_order order, memory_scope scope) noexcept { if constexpr(sizeof(T) == sizeof(int32_t)) { -#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA +#ifdef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS if(scope == memory_scope::device && order == memory_order::acquire){ return bit_cast(__acpp_cuda_atomic_load_device_acq_i32( reinterpret_cast(addr))); @@ -548,6 +551,8 @@ __acpp_atomic_fetch_max(double *addr, double x, memory_order order, } } +#undef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS + #endif #endif From 7cb239b58066d10e76b655a380aa91f67533aef9 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 4 Dec 2024 05:50:38 +0100 Subject: [PATCH 075/126] [stdpar][algorithms] Add transform_in/exclusive_scan --- doc/stdpar.md | 3 +- include/hipSYCL/algorithms/numeric.hpp | 32 ++- include/hipSYCL/algorithms/scan/scan.hpp | 135 +++++++++++ .../stdpar/detail/offload_heuristic_db.hpp | 4 +- 
.../hipSYCL/std/stdpar/pstl-impl/numeric.hpp | 217 ++++++++++++++++++ tests/CMakeLists.txt | 2 + tests/pstl/transform_exclusive_scan.cpp | 184 +++++++++++++++ tests/pstl/transform_inclusive_scan.cpp | 194 ++++++++++++++++ 8 files changed, 767 insertions(+), 4 deletions(-) create mode 100644 include/hipSYCL/algorithms/scan/scan.hpp create mode 100644 tests/pstl/transform_exclusive_scan.cpp create mode 100644 tests/pstl/transform_inclusive_scan.cpp diff --git a/doc/stdpar.md b/doc/stdpar.md index 060a1c280..d5bf3f6ef 100644 --- a/doc/stdpar.md +++ b/doc/stdpar.md @@ -46,7 +46,8 @@ Offloading is implemented for the following STL algorithms: |`sort` | may not scale optimally for large problems | |`inclusive_scan` | | |`exclusive_scan` | | - +|`transform_inclusive_scan` | | +|`transform_exclusive_scan` | | For all other execution policies or algorithms, the algorithm will compile and execute correctly, however the regular host implementation of the algorithm provided by the C++ standard library implementation will be invoked and no offloading takes place. 
diff --git a/include/hipSYCL/algorithms/numeric.hpp b/include/hipSYCL/algorithms/numeric.hpp index a2fb96611..526b7a0ff 100644 --- a/include/hipSYCL/algorithms/numeric.hpp +++ b/include/hipSYCL/algorithms/numeric.hpp @@ -23,7 +23,7 @@ #include "hipSYCL/sycl/queue.hpp" #include "hipSYCL/algorithms/reduction/reduction_descriptor.hpp" #include "hipSYCL/algorithms/reduction/reduction_engine.hpp" -#include "hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp" +#include "hipSYCL/algorithms/scan/scan.hpp" #include "hipSYCL/algorithms/util/memory_streaming.hpp" @@ -361,6 +361,36 @@ sycl::event exclusive_scan(sycl::queue &q, std::plus<>{}, deps); } +template +sycl::event transform_inclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, BinaryOp binary_op, UnaryOp unary_op, + const std::vector &deps = {}) { + return scanning::transform_scan(q, scratch_allocations, first, last, + d_first, unary_op, binary_op, + std::nullopt, deps); +} + +template +sycl::event transform_inclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, BinaryOp binary_op, UnaryOp unary_op, + T init, const std::vector &deps = {}) { + return scanning::transform_scan(q, scratch_allocations, first, last, + d_first, unary_op, binary_op, + init, deps); +} + +template +sycl::event transform_exclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, T init, BinaryOp binary_op, + UnaryOp unary_op, const std::vector &deps = {}) { + return scanning::transform_scan(q, scratch_allocations, first, last, + d_first, unary_op, binary_op, init, + deps); +} + } // algorithms diff --git a/include/hipSYCL/algorithms/scan/scan.hpp b/include/hipSYCL/algorithms/scan/scan.hpp new file mode 100644 index 000000000..4f4581be6 --- /dev/null +++ b/include/hipSYCL/algorithms/scan/scan.hpp @@ -0,0 +1,135 @@ +/* + * This file 
is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_SCAN_HPP +#define ACPP_ALGORITHMS_SCAN_HPP + +#include "hipSYCL/sycl/event.hpp" +#include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +#include "decoupled_lookback_scan.hpp" +#include + +namespace hipsycl::algorithms::scanning { + +namespace detail { + +inline std::size_t select_scan_work_group_size(sycl::queue& q) { + std::size_t group_size = 128; + if(q.get_device().AdaptiveCpp_device_id().get_backend() == sycl::backend::omp) { + group_size = 1024; + } + return group_size; +} + +} + + +template +sycl::event generate_scan_process(sycl::queue &q, util::allocation_group &scratch_allocations, + std::size_t problem_size, BinaryOp op, + OptionalInitT init, Generator gen, Processor processor, + const std::vector &deps = {}) { + + std::size_t group_size = detail::select_scan_work_group_size(q); + + return scanning::decoupled_lookback_scan( + q, scratch_allocations, gen, processor, op, problem_size, + group_size, init, deps); +} + +template +sycl::event scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + OptionalInitT init, + const std::vector &deps = {}) { + + auto generator = [=](auto idx, auto effective_group_id, auto effective_global_id, + auto problem_size) { + if(effective_global_id >= problem_size) + effective_global_id = problem_size - 1; + + InputIt it = first; + std::advance(it, effective_global_id); + return *it; + }; + auto result_processor = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size, + auto value) { + if (effective_global_id < problem_size) 
{ + OutputIt it = d_first; + std::advance(it, effective_global_id); + *it = value; + } + }; + + std::size_t problem_size = std::distance(first, last); + using T = std::decay_t; + + return generate_scan_process( + q, scratch_allocations, problem_size, op, init, generator, + result_processor, deps); +} + +template +sycl::event transform_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + UnaryOp unary_op, BinaryOp op, OptionalInitT init, + const std::vector &deps = {}) { + + using T = std::decay_t; + + auto generator = [=](auto idx, auto effective_group_id, auto effective_global_id, + auto problem_size) { + if(effective_global_id >= problem_size) { + if constexpr(std::is_constructible_v) { + return T{}; + } else { + // This might be invalid according to a very strict implementation of C++ + // definition of e.g. transform_reduce, since it does not guarantee that + // unary_op is executed exactly once per element. + // However, working around this might be fairly costly in case T is not + // default constructible (Idea: Global variable guarded by an atomic lock + // which is set by the first thread to have loaded a value), so for + // now we do "the simple thing". This is probably still better than + // not offloading in that case. 
+ return unary_op(*first); + } + } + + InputIt it = first; + std::advance(it, effective_global_id); + return unary_op(*it); + }; + auto result_processor = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size, + auto value) { + if (effective_global_id < problem_size) { + OutputIt it = d_first; + std::advance(it, effective_global_id); + *it = value; + } + }; + + std::size_t problem_size = std::distance(first, last); + + return generate_scan_process( + q, scratch_allocations, problem_size, op, init, generator, + result_processor, deps); +} +} + +#endif diff --git a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp index ed17da13b..c4e2809b7 100644 --- a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp +++ b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp @@ -54,8 +54,8 @@ struct sort {}; struct merge {}; struct inclusive_scan {}; struct exclusive_scan {}; - - +struct transform_inclusive_scan {}; +struct transform_exclusive_scan {}; struct transform_reduce {}; struct reduce {}; } // namespace algorithm_type diff --git a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp index a65fdbe12..2f3b8d4d7 100644 --- a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp @@ -466,6 +466,115 @@ OutputIt exclusive_scan(hipsycl::stdpar::par_unseq, } + +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par_unseq, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + 
hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan( + queue, scratch_group, first, last, d_first, binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, + first, last, d_first, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par_unseq, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op, T init) { + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan(queue, scratch_group, first, + last, d_first, binary_op, + unary_op, init); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, + first, last, d_first, binary_op, + unary_op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op, + init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt2 transform_exclusive_scan(hipsycl::stdpar::par_unseq, ForwardIt1 first, + ForwardIt1 last, ForwardIt2 
d_first, T init, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_exclusive_scan(queue, scratch_group, first, + last, d_first, init, + binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_exclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, + first, last, d_first, init, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_exclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, binary_op, + unary_op); +} + //////////////////// par policy ///////////////////////////////////// @@ -907,6 +1016,114 @@ OutputIt exclusive_scan(hipsycl::stdpar::par, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init); } +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan( + queue, scratch_group, first, last, d_first, binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return 
std::transform_inclusive_scan(hipsycl::stdpar::par_host_fallback, + first, last, d_first, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op, T init) { + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan(queue, scratch_group, first, + last, d_first, binary_op, + unary_op, init); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_inclusive_scan(hipsycl::stdpar::par_host_fallback, + first, last, d_first, binary_op, + unary_op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op, + init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt2 transform_exclusive_scan(hipsycl::stdpar::par, ForwardIt1 first, + ForwardIt1 last, ForwardIt2 d_first, T init, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + 
hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_exclusive_scan(queue, scratch_group, first, + last, d_first, init, + binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_exclusive_scan(hipsycl::stdpar::par_host_fallback, + first, last, d_first, init, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_exclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, binary_op, + unary_op); +} + } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fd4ac1393..0e2c0375b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -153,6 +153,8 @@ if(WITH_PSTL_TESTS) pstl/sort.cpp pstl/transform.cpp pstl/transform_reduce.cpp + pstl/transform_inclusive_scan.cpp + pstl/transform_exclusive_scan.cpp pstl/pointer_validation.cpp pstl/allocation_map.cpp pstl/free_space_map.cpp) diff --git a/tests/pstl/transform_exclusive_scan.cpp b/tests/pstl/transform_exclusive_scan.cpp new file mode 100644 index 000000000..8ccd971dd --- /dev/null +++ b/tests/pstl/transform_exclusive_scan.cpp @@ -0,0 +1,184 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_transform_exclusive_scan, + enable_unified_shared_memory) + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; + t.data[0] = x; + return t; + } + + T get() const { + return data[0]; + } +private: + non_default_constructible(){} + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; +}; + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template +bool operator==(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() == b.get(); +} + +template +bool operator!=(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() != b.get(); +} + +template +void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { + std::vector data; + for(std::size_t i = 0; i < size; ++i) + data.push_back(gen(i)); + + + auto unary_op = [=](auto x){ + return op(x, x); + }; + + std::vector reference = data; + std::transform_exclusive_scan(data.begin(), data.end(), reference.begin(), + init, op, unary_op); + + + std::vector device_result = data; + BOOST_CHECK(std::transform_exclusive_scan(pol, data.begin(), data.end(), + device_result.begin(), init, op, unary_op) == + device_result.end()); + + + BOOST_CHECK(reference == device_result); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast(i); + }; +} + +template +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); 
+ }; +} + +template +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + non_constructible_t::make(3ull), + get_non_constructible_bin_op(), ProblemSize); + + /*using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize);*/ +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_large) { + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + +BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/pstl/transform_inclusive_scan.cpp b/tests/pstl/transform_inclusive_scan.cpp new file mode 100644 index 000000000..7c6e2a27a --- /dev/null +++ b/tests/pstl/transform_inclusive_scan.cpp @@ -0,0 
+1,194 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_transform_inclusive_scan, + enable_unified_shared_memory) + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; + t.data[0] = x; + return t; + } + + T get() const { + return data[0]; + } +private: + non_default_constructible(){} + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; +}; + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template +bool operator==(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() == b.get(); +} + +template +bool operator!=(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() != b.get(); +} + +template +void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { + std::vector data; + for(std::size_t i = 0; i < size; ++i) + data.push_back(gen(i)); + + + auto unary_op = [=](auto x){ + return op(x, x); + }; + + std::vector reference0 = data; + std::transform_inclusive_scan(data.begin(), data.end(), reference0.begin(), + op, unary_op); + + std::vector reference1 = data; + std::transform_inclusive_scan(data.begin(), data.end(), reference1.begin(), + op, unary_op, init); + + std::vector device_result0 = data; + + BOOST_CHECK(std::transform_inclusive_scan(pol, data.begin(), data.end(), + 
device_result0.begin(), op, unary_op) == + device_result0.end()); + + std::vector device_result1 = data; + BOOST_CHECK(std::transform_inclusive_scan(pol, data.begin(), data.end(), + device_result1.begin(), op, unary_op, init) == + device_result1.end()); + + + BOOST_CHECK(reference0 == device_result0); + BOOST_CHECK(reference1 == device_result1); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast(i); + }; +} + +template +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); + }; +} + +template +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + non_constructible_t::make(3ull), + get_non_constructible_bin_op(), ProblemSize); + + /*using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize);*/ +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_large) { + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + 
+BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() From 65a7d75660064c283e023a0818aa86cdd77c4b51 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 2 Dec 2024 02:44:41 +0100 Subject: [PATCH 076/126] Some performance improvements --- include/hipSYCL/algorithms/numeric.hpp | 3 + .../scan/decoupled_lookback_scan.hpp | 330 +++++++++++++++--- src/runtime/omp/omp_hardware_manager.cpp | 2 + 3 files changed, 289 insertions(+), 46 deletions(-) diff --git a/include/hipSYCL/algorithms/numeric.hpp b/include/hipSYCL/algorithms/numeric.hpp index 526b7a0ff..b565d15e9 100644 --- a/include/hipSYCL/algorithms/numeric.hpp +++ b/include/hipSYCL/algorithms/numeric.hpp @@ -306,6 +306,9 @@ sycl::event scan(sycl::queue &q, util::allocation_group &scratch_allocations, std::size_t problem_size = std::distance(first, last); std::size_t group_size = 128; + if(q.get_device().AdaptiveCpp_device_id().get_backend() == sycl::backend::omp) { + group_size = 1024; + } using T = std::decay_t; return scanning::decoupled_lookback_scan( diff --git a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp index c38f48072..a62bc7539 100644 --- a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp +++ b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp @@ -122,6 +122,91 @@ T collective_broadcast(sycl::nd_item<1> idx, T x, int local_id, T* local_mem) { } } +<<<<<<< HEAD +======= +template +void iterate_host_and_inclusive_group_scan( + sycl::nd_item<1> idx, BinaryOp op, T 
*local_mem, std::size_t global_group_id, + Generator gen, Processor result_processor, + PrefixHandler local_prefix_to_global_prefix) { + + const int lid = idx.get_local_linear_id(); + const int group_size = idx.get_local_range().size(); + + const int num_elements = group_size * WorkPerItem; + if(lid == 0) { + T current_inclusive_scan; + for(int i = 0; i < num_elements; ++i) { + T current_element = gen(idx, i % WorkPerItem, i); + if(i == 0) + current_inclusive_scan = current_element; + else + current_inclusive_scan = op(current_inclusive_scan, current_element); + // we store the result array at i+1 to avoid conflicts with the + // fallback group broadcast, which uses element 0. + local_mem[i+1] = current_inclusive_scan; + } + } + sycl::group_barrier(idx.get_group()); + T global_prefix = local_prefix_to_global_prefix( + // Index is not -1 because we store the array at offset 1. + lid, local_mem[group_size * WorkPerItem]); + sycl::group_barrier(idx.get_group()); + if(global_group_id != 0 && lid == 0) { + for(int i = 1; i <= num_elements; ++i) { + local_mem[i] = op(global_prefix, local_mem[i]); + } + } + sycl::group_barrier(idx.get_group()); + + for(int i = 0; i < WorkPerItem; ++i) { + int effective_id = lid * WorkPerItem + i; + result_processor(i, effective_id, local_mem[effective_id+1]); + } +} + +template +void iterate_and_inclusive_group_scan( + sycl::nd_item<1> idx, BinaryOp op, T *local_mem, std::size_t global_group_id, + Generator gen, Processor result_processor, + PrefixHandler local_prefix_to_global_prefix) { + + const int lid = idx.get_local_linear_id(); + const int group_size = idx.get_local_range().size(); + + + T current_exclusive_prefix; + T scan_result [WorkPerItem]; + for(int invocation = 0; invocation < WorkPerItem; ++invocation) { + int current_id = invocation * group_size + lid; + T my_element = gen(idx, invocation, current_id); + T local_scan_result = + collective_inclusive_group_scan(idx, my_element, op, local_mem); + + if(invocation != 0) + 
local_scan_result = op(current_exclusive_prefix, local_scan_result); + + current_exclusive_prefix = collective_broadcast( + idx, local_scan_result, group_size - 1, local_mem); + + scan_result[invocation] = local_scan_result; + } + // has local prefix here, this also does lookback + T global_prefix = local_prefix_to_global_prefix(lid, current_exclusive_prefix); + + if(global_group_id != 0) { + for(int i = 0; i < WorkPerItem; ++i) { + scan_result[i] = op(global_prefix, scan_result[i]); + } + } + for(int i = 0; i < WorkPerItem; ++i) { + result_processor(i, i*group_size+lid, scan_result[i]); + } +} + +>>>>>>> 22982201 (Some performance improvements) template T exclusive_prefix_look_back(const T &dummy_init, int effective_group_id, detail::status *status, T *group_aggregate, @@ -184,10 +269,11 @@ T load_data_element(Generator &&gen, sycl::nd_item<1> idx, BinaryOp op, template -void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, - uint32_t *group_counter, BinaryOp op, OptionalInitT init, - std::size_t problem_size, Generator &&gen, - Processor &&processor) { +void flat_group_scan_kernel(sycl::nd_item<1> idx, T *local_memory, + scratch_data scratch, uint32_t *group_counter, + BinaryOp op, OptionalInitT init, + std::size_t problem_size, Generator &&gen, + Processor &&processor) { sycl::atomic_ref @@ -273,6 +359,142 @@ void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, processor(idx, effective_group_id, global_id, problem_size, local_scan_result); } +template +void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, + uint32_t *group_counter, BinaryOp op, OptionalInitT init, + std::size_t problem_size, Generator &&data_generator, + Processor &&processor) { + sycl::atomic_ref + group_id_counter{*group_counter}; + + const int local_id = idx.get_local_linear_id(); + const int local_size = idx.get_local_range().size(); + uint32_t effective_group_id = idx.get_group_linear_id(); + if(local_id == 0) { + 
effective_group_id = group_id_counter.fetch_add(static_cast(1)); + } + effective_group_id = collective_broadcast( + idx, effective_group_id, 0, reinterpret_cast(local_memory)); + + + auto generator = [=](sycl::nd_item<1> idx, int invocation, int current_local_id) { + // This invokes gen for the current work item to obtain our data element + // for the scan. If we are dealing with an exclusive scan, load_data_element + // shifts the data access by 1, thus allowing us to treat the scan as inclusive + // in the subsequent algorithm. + // It also applies init to the first data element, if provided. + std::size_t global_id = + effective_group_id * local_size * WorkPerItem + current_local_id; + + return load_data_element( + data_generator, idx, op, effective_group_id, global_id, problem_size, init); + }; + + auto local_prefix_to_global_prefix = [=](int local_id, + const T &local_inclusive_prefix) { + uint32_t *status_ptr = + reinterpret_cast(&scratch.group_status[effective_group_id]); + sycl::atomic_ref + status_ref{*status_ptr}; + + // Set group aggregate which we now know after scan. The first group + // Can also set its prefix and is done. 
+ if (local_id == 0) { + if (effective_group_id == 0) { + scratch.group_aggregate[effective_group_id] = local_inclusive_prefix; + scratch.inclusive_prefix[effective_group_id] = local_inclusive_prefix; + status_ref.store(static_cast(status::prefix_available)); + } else { + scratch.group_aggregate[effective_group_id] = local_inclusive_prefix; + status_ref.store(static_cast(status::aggregate_available)); + } + } + + sycl::group_barrier(idx.get_group()); + + // All groups except group 0 need to perform lookback to find their prefix + T exclusive_prefix; + if(effective_group_id != 0) { + if(local_id == 0) { + exclusive_prefix = exclusive_prefix_look_back( + exclusive_prefix, effective_group_id, scratch.group_status, + scratch.group_aggregate, scratch.inclusive_prefix, op); + } + exclusive_prefix = collective_broadcast( + idx, exclusive_prefix, 0, local_memory); + + // All groups except first need to update their prefix + if(local_id == local_size - 1){ + scratch.inclusive_prefix[effective_group_id] = + op(exclusive_prefix, local_inclusive_prefix); + status_ref.store(static_cast(status::prefix_available)); + } + } + return exclusive_prefix; + }; + + auto result_processor = [=](int invocation_id, int current_local_id, + T scan_result) { + std::size_t global_id = + effective_group_id * local_size * WorkPerItem + current_local_id; + processor(idx, effective_group_id, global_id, problem_size, scan_result); + }; + + __acpp_if_target_sscp( + namespace jit = sycl::AdaptiveCpp_jit; + if (jit::reflect() == + jit::compiler_backend::host) { + iterate_host_and_inclusive_group_scan( + idx, op, local_memory, effective_group_id, generator, + result_processor, local_prefix_to_global_prefix); + return; + }); + __acpp_if_target_host( + iterate_host_and_inclusive_group_scan( + idx, op, local_memory, effective_group_id, generator, + result_processor, local_prefix_to_global_prefix); + return; + ); + // Only executed for non-host + iterate_and_inclusive_group_scan( + idx, op, local_memory, 
effective_group_id, generator, result_processor, + local_prefix_to_global_prefix); + +} + +template +constexpr int work_per_item() { + if constexpr(!std::is_constructible_v) + return 1; + else { + return 16; + } +} + +template +void select_and_run_scan_kernel(sycl::nd_item<1> idx, + T *local_memory, scratch_data scratch, + uint32_t *group_counter, BinaryOp op, + OptionalInitT init, std::size_t problem_size, + Generator &&data_generator, + Processor &&processor) { + if constexpr (!std::is_constructible_v) { + flat_group_scan_kernel(idx, local_memory, scratch, + group_counter, op, init, problem_size, + data_generator, processor); + } else { + scan_kernel(), IsInclusive>( + idx, local_memory, scratch, group_counter, op, init, problem_size, + data_generator, processor); + } +} + } // detail @@ -341,7 +563,10 @@ decoupled_lookback_scan(sycl::queue &q, util::allocation_group &scratch_alloc, "Init argument must be of std::nullopt_t type or exact type of scan " "data elements"); - std::size_t num_groups = (problem_size + group_size - 1) / group_size; + std::size_t num_items = (problem_size + detail::work_per_item() - 1) / + detail::work_per_item(); + std::size_t num_groups = (num_items + group_size - 1) / group_size; + detail::scratch_data scratch{scratch_alloc, num_groups}; uint32_t* group_counter = scratch_alloc.obtain(1); @@ -356,53 +581,66 @@ decoupled_lookback_scan(sycl::queue &q, util::allocation_group &scratch_alloc, if(!q.is_in_order()) deps.push_back(initialization_evt); + bool is_host = q.get_device().get_backend() == sycl::backend::omp; + sycl::nd_range<1> kernel_range{num_groups * group_size, group_size}; if constexpr(detail::can_use_group_algorithms()) { - return q.parallel_for(kernel_range, deps, [=](auto idx) { - detail::scan_kernel(idx, nullptr, scratch, group_counter, op, - init, problem_size, gen, processor); - }); - } else { - // We need local memory: - // - 1 data element per work item - // - at least size for one uint32_t to broadcast group id - 
std::size_t local_mem_elements = - std::max(group_size, (sizeof(uint32_t) + sizeof(T) - 1) / sizeof(T)); - - // This is not entirely correct since max local mem size can also depend - // on work group size. - // We also assume that there is no other local memory consumer. - // TODO Improve this - std::size_t max_local_size = - q.get_device().get_info(); - - bool has_sufficient_local_memory = static_cast(max_local_size) >= - 1.5 * sizeof(T) * local_mem_elements; - - if(has_sufficient_local_memory) { - return q.submit([&](sycl::handler &cgh) { - cgh.depends_on(deps); - - sycl::local_accessor local_mem{local_mem_elements, cgh}; - cgh.parallel_for(kernel_range, [=](auto idx) { - detail::scan_kernel(idx, &(local_mem[0]), - scratch, group_counter, op, init, - problem_size, gen, processor); - }); - }); - } else { - // This is a super inefficient dummy algorithm for now that requires - // large scratch storage - T* emulated_local_mem = scratch_alloc.obtain(num_groups * local_mem_elements); - + if(!is_host) { return q.parallel_for(kernel_range, deps, [=](auto idx) { - detail::scan_kernel( - idx, - emulated_local_mem + local_mem_elements * idx.get_group_linear_id(), - scratch, group_counter, op, init, problem_size, gen, processor); + detail::select_and_run_scan_kernel( + idx, static_cast(nullptr), scratch, + group_counter, op, init, problem_size, gen, processor); }); } } + + // We need local memory: + // - 1 data element per work item + // - at least size for one uint32_t to broadcast group id + std::size_t local_mem_elements = + std::max(group_size, (sizeof(uint32_t) + sizeof(T) - 1) / sizeof(T)); + if(is_host) { + // host also needs one element per every processed element + local_mem_elements *= detail::work_per_item(); + // ... in addition to broadcast! + ++local_mem_elements; + } + + // This is not entirely correct since max local mem size can also depend + // on work group size. + // We also assume that there is no other local memory consumer. 
+ // TODO Improve this + std::size_t max_local_size = + q.get_device().get_info(); + + + bool has_sufficient_local_memory = + is_host || static_cast(max_local_size) >= + 1.5 * sizeof(T) * local_mem_elements; + + if(has_sufficient_local_memory) { + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + sycl::local_accessor local_mem{local_mem_elements, cgh}; + cgh.parallel_for(kernel_range, [=](auto idx) { + detail::select_and_run_scan_kernel( + idx, &(local_mem[0]), scratch, group_counter, + op, init, problem_size, gen, processor); + }); + }); + } else { + // This is a super inefficient dummy algorithm for now that requires + // large scratch storage + T* emulated_local_mem = scratch_alloc.obtain(num_groups * local_mem_elements); + + return q.parallel_for(kernel_range, deps, [=](auto idx) { + detail::select_and_run_scan_kernel( + idx, + emulated_local_mem + local_mem_elements * idx.get_group_linear_id(), + scratch, group_counter, op, init, problem_size, gen, processor); + }); + } } } diff --git a/src/runtime/omp/omp_hardware_manager.cpp b/src/runtime/omp/omp_hardware_manager.cpp index d36aad6c7..ebaa007b8 100644 --- a/src/runtime/omp/omp_hardware_manager.cpp +++ b/src/runtime/omp/omp_hardware_manager.cpp @@ -122,6 +122,8 @@ std::size_t omp_hardware_context::get_property(device_uint_property prop) const { switch (prop) { case device_uint_property::max_compute_units: + // Do not change this; heuristics in algorithms library + // use this. 
return omp_get_num_procs(); break; case device_uint_property::max_global_size0: From bb20795b55736e5e5de07738750a3d8c7a5a018d Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 27 Nov 2024 06:15:14 +0100 Subject: [PATCH 077/126] Improve performance on CPU --- .../scan/decoupled_lookback_scan.hpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp index a62bc7539..f81fd0399 100644 --- a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp +++ b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp @@ -20,6 +20,7 @@ #include "hipSYCL/sycl/queue.hpp" #include "hipSYCL/sycl/libkernel/atomic_ref.hpp" #include "hipSYCL/sycl/libkernel/group_functions.hpp" +#include "hipSYCL/sycl/jit.hpp" #include "hipSYCL/algorithms/util/allocation_cache.hpp" namespace hipsycl::algorithms::scanning { @@ -102,8 +103,19 @@ T collective_inclusive_group_scan(sycl::nd_item<1> idx, T my_element, if constexpr(can_use_group_algorithms()) { // TODO } else { + namespace jit = sycl::AdaptiveCpp_jit; + __acpp_if_target_sscp( + // For some reason, using the compile_if_else wrapper introduces + // overheads for host JIT in this case :( + // This seems to be unique to this particular case here though. 
+ if(jit::reflect() == + jit::compiler_backend::host) { + return sequential_scan(idx, my_element, op, local_mem); + } else { + return kogge_stone_scan(idx, my_element, op, local_mem); + } + ); return kogge_stone_scan(idx, my_element, op, local_mem); - //return sequential_scan(idx, my_element, op, local_mem); } } @@ -122,8 +134,6 @@ T collective_broadcast(sycl::nd_item<1> idx, T x, int local_id, T* local_mem) { } } -<<<<<<< HEAD -======= template void iterate_host_and_inclusive_group_scan( @@ -206,7 +216,6 @@ void iterate_and_inclusive_group_scan( } } ->>>>>>> 22982201 (Some performance improvements) template T exclusive_prefix_look_back(const T &dummy_init, int effective_group_id, detail::status *status, T *group_aggregate, @@ -333,6 +342,8 @@ void flat_group_scan_kernel(sycl::nd_item<1> idx, T *local_memory, } } + sycl::group_barrier(idx.get_group()); + // All groups except group 0 need to perform lookback to find their prefix if(effective_group_id != 0) { // my_element is a dummy value here; avoid relying on default constructor From 75c9bdf8ab68c35284f52272ecda831598302c5b Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Mon, 2 Dec 2024 06:53:48 +0100 Subject: [PATCH 078/126] [stdpar] Properly implement copy_if --- include/hipSYCL/algorithms/algorithm.hpp | 84 +++++++++++++++---- include/hipSYCL/algorithms/numeric.hpp | 54 ++---------- .../std/stdpar/pstl-impl/algorithm.hpp | 41 +++++++-- tests/pstl/copy_if.cpp | 11 ++- 4 files changed, 113 insertions(+), 77 deletions(-) diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index ce9d6b884..e7529ffdb 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ b/include/hipSYCL/algorithms/algorithm.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "hipSYCL/sycl/libkernel/accessor.hpp" #include "hipSYCL/sycl/libkernel/atomic_builtins.hpp" #include "hipSYCL/sycl/libkernel/memory.hpp" @@ -22,11 +23,13 @@ #include "hipSYCL/sycl/event.hpp" #include 
"hipSYCL/sycl/queue.hpp" #include "merge/merge.hpp" +#include "scan/scan.hpp" #include "util/traits.hpp" #include "hipSYCL/algorithms/util/allocation_cache.hpp" #include "hipSYCL/algorithms/util/memory_streaming.hpp" #include "hipSYCL/algorithms/sort/bitonic_sort.hpp" #include "hipSYCL/algorithms/merge/merge.hpp" +#include "hipSYCL/algorithms/scan/scan.hpp" namespace hipsycl::algorithms { @@ -167,24 +170,71 @@ sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, } } - -template -sycl::event copy_if(sycl::queue& q, - ForwardIt1 first, ForwardIt1 last, - ForwardIt2 d_first, - UnaryPredicate pred) { - if(first == last) +template +sycl::event copy_if(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + UnaryPredicate pred, + std::size_t *num_elements_copied = nullptr, + const std::vector &deps = {}) { + if(first == last) { + if(num_elements_copied) + *num_elements_copied = 0; return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, - [=](sycl::id<1> id) { - auto input = first; - auto output = d_first; - std::advance(input, id[0]); - std::advance(output, id[0]); - auto input_v = *input; - if(pred(input_v)) - *output = input_v; - }); + } + + // TODO: We could optimize by switching between 32/64 bit types + // depending on problem size + using ScanT = std::size_t; + + auto generator = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size) { + if(effective_global_id >= problem_size) + return ScanT{0}; + + ForwardIt1 it = first; + std::advance(it, effective_global_id); + if(pred(*it)) + return ScanT{1}; + + return ScanT{0}; + }; + + auto result_processor = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size, + auto value) { + if (effective_global_id < problem_size) { + ForwardIt2 output = d_first; + ForwardIt1 input = first; + std::advance(input, effective_global_id); + std::advance(output, value); + 
+ bool needs_copy = false; + + if(effective_global_id < problem_size) { + auto input_value = *input; + needs_copy = pred(input_value); + if(needs_copy) + *output = *input; + } + + if (effective_global_id == problem_size - 1 && num_elements_copied) { + ScanT inclusive_scan_result = value; + // We did an exclusive scan, so if the last element also was copied, + // we need to add that. + if(needs_copy) + ++inclusive_scan_result; + + *num_elements_copied = static_cast(inclusive_scan_result); + } + } + }; + + std::size_t problem_size = std::distance(first, last); + + constexpr bool is_inclusive_scan = false; + return scanning::generate_scan_process( + q, scratch_allocations, problem_size, sycl::plus<>{}, + ScanT{0}, generator, result_processor, deps); } template diff --git a/include/hipSYCL/algorithms/numeric.hpp b/include/hipSYCL/algorithms/numeric.hpp index b565d15e9..e37aa240e 100644 --- a/include/hipSYCL/algorithms/numeric.hpp +++ b/include/hipSYCL/algorithms/numeric.hpp @@ -276,56 +276,14 @@ sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ///////////////////////////// scans ///////////////////////////////////// -namespace detail { - -template -sycl::event scan(sycl::queue &q, util::allocation_group &scratch_allocations, - InputIt first, InputIt last, OutputIt d_first, BinaryOp op, - OptionalInitT init, - const std::vector &deps = {}) { - - auto generator = [=](auto idx, auto effective_group_id, auto effective_global_id, - auto problem_size) { - if(effective_global_id >= problem_size) - effective_global_id = problem_size - 1; - - InputIt it = first; - std::advance(it, effective_global_id); - return *it; - }; - auto result_processor = [=](auto idx, auto effective_group_id, - auto effective_global_id, auto problem_size, - auto value) { - if (effective_global_id < problem_size) { - OutputIt it = d_first; - std::advance(it, effective_global_id); - *it = value; - } - }; - - std::size_t problem_size = std::distance(first, last); - 
std::size_t group_size = 128; - if(q.get_device().AdaptiveCpp_device_id().get_backend() == sycl::backend::omp) { - group_size = 1024; - } - - using T = std::decay_t; - return scanning::decoupled_lookback_scan( - q, scratch_allocations, generator, result_processor, op, problem_size, - group_size, init, deps); -} - -} // detail - template sycl::event inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, InputIt last, OutputIt d_first, BinaryOp op, const std::vector &deps = {}) { - return detail::scan(q, scratch_allocations, first, last, d_first, op, - std::nullopt, deps); + return scanning::scan(q, scratch_allocations, first, last, d_first, op, + std::nullopt, deps); } template @@ -333,8 +291,8 @@ sycl::event inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, InputIt last, OutputIt d_first, BinaryOp op, T init, const std::vector &deps = {}) { - return detail::scan(q, scratch_allocations, first, last, d_first, op, - init, deps); + return scanning::scan(q, scratch_allocations, first, last, d_first, op, + init, deps); } template @@ -351,8 +309,8 @@ sycl::event exclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, InputIt last, OutputIt d_first, T init, BinaryOp op, const std::vector &deps = {}) { - return detail::scan(q, scratch_allocations, first, last, d_first, op, - init, deps); + return scanning::scan(q, scratch_allocations, first, last, d_first, op, + init, deps); } template diff --git a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp index e4c3f611b..89e2c085f 100644 --- a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp @@ -20,6 +20,7 @@ #include "../detail/stdpar_defs.hpp" #include "../detail/offload.hpp" #include "hipSYCL/algorithms/algorithm.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" #include 
"hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp" namespace std { @@ -153,9 +154,23 @@ ForwardIt2 copy_if(hipsycl::stdpar::par_unseq, ForwardIt2 d_first, UnaryPredicate pred) { auto offloader = [&](auto& queue){ + auto output_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::host>(); + auto device_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + std::size_t *num_elements_copied = + output_scratch_group.obtain(1); + + hipsycl::algorithms::copy_if(queue, device_scratch_group, first, last, + d_first, pred, num_elements_copied); + queue.wait(); + ForwardIt2 d_last = d_first; - std::advance(d_last, std::distance(first, last)); - hipsycl::algorithms::copy_if(queue, first, last, d_first, pred); + std::advance(d_last, *num_elements_copied); return d_last; }; @@ -164,7 +179,7 @@ ForwardIt2 copy_if(hipsycl::stdpar::par_unseq, d_first, pred); }; - HIPSYCL_STDPAR_OFFLOAD( + HIPSYCL_STDPAR_BLOCKING_OFFLOAD( hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::copy_if{}, hipsycl::stdpar::par_unseq{}), std::distance(first, last), ForwardIt2, offloader, fallback, first, @@ -725,9 +740,23 @@ ForwardIt2 copy_if(hipsycl::stdpar::par, ForwardIt2 d_first, UnaryPredicate pred) { auto offloader = [&](auto& queue){ + auto output_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::host>(); + auto device_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + std::size_t *num_elements_copied = + output_scratch_group.obtain(1); + + hipsycl::algorithms::copy_if(queue, device_scratch_group, first, last, + d_first, pred, num_elements_copied); + queue.wait(); + ForwardIt2 d_last = d_first; - std::advance(d_last, 
std::distance(first, last)); - hipsycl::algorithms::copy_if(queue, first, last, d_first, pred); + std::advance(d_last, *num_elements_copied); return d_last; }; @@ -736,7 +765,7 @@ ForwardIt2 copy_if(hipsycl::stdpar::par, d_first, pred); }; - HIPSYCL_STDPAR_OFFLOAD( + HIPSYCL_STDPAR_BLOCKING_OFFLOAD( hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::copy_if{}, hipsycl::stdpar::par{}), std::distance(first, last), ForwardIt2, offloader, fallback, first, diff --git a/tests/pstl/copy_if.cpp b/tests/pstl/copy_if.cpp index eacce1e8e..f58d4ad65 100644 --- a/tests/pstl/copy_if.cpp +++ b/tests/pstl/copy_if.cpp @@ -35,13 +35,12 @@ void test_copy_if(std::size_t problem_size, Generator&& gen) { auto ret = std::copy_if(std::execution::par_unseq, data.begin(), data.end(), dest_device.begin(), p); - std::copy_if(data.begin(), data.end(), dest_host.begin(), p); + auto ret_reference = std::copy_if(data.begin(), data.end(), dest_host.begin(), p); - BOOST_CHECK(ret == dest_device.begin() + problem_size); - // Our copy_if implementation is currently incorrect, since - // we always copy results to the same position (we would - // actually need to run a scan algorithm to find the right place) - //BOOST_CHECK(dest_device == dest_host); + BOOST_CHECK(std::distance(dest_device.begin(), ret) == + std::distance(dest_host.begin(), ret_reference)); + + BOOST_CHECK(dest_device == dest_host); } BOOST_AUTO_TEST_CASE(par_unseq_empty) { From 2bf324645ac680cca8df4fdada2ec905a4228489 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 26 Nov 2024 22:41:45 +0100 Subject: [PATCH 079/126] Temporarily disable massive data type tests for scans --- tests/pstl/exclusive_scan.cpp | 13 ++++++------- tests/pstl/inclusive_scan.cpp | 15 +++++++-------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tests/pstl/exclusive_scan.cpp b/tests/pstl/exclusive_scan.cpp index 0784a247a..592b3b049 100644 --- a/tests/pstl/exclusive_scan.cpp +++ b/tests/pstl/exclusive_scan.cpp @@ -27,18 +27,17 
@@ template struct non_default_constructible { public: static auto make(T x){ - non_default_constructible t; t.x = x; + non_default_constructible t; + t.data[0] = x; return t; } T get() const { - return x; + return data[0]; } private: non_default_constructible(){} - T x; - std::array - padding; + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; }; template @@ -137,13 +136,13 @@ void run_all_tests(Policy&& pol) { non_constructible_t::make(3ull), get_non_constructible_bin_op(), ProblemSize); - using massive_non_constructible_t = + /*using massive_non_constructible_t = non_default_constructible; test_scan(std::execution::par_unseq, get_non_constructible_generator(), massive_non_constructible_t::make(3ull), get_non_constructible_bin_op(), - ProblemSize); + ProblemSize);*/ } BOOST_AUTO_TEST_CASE(par_unseq_empty) { diff --git a/tests/pstl/inclusive_scan.cpp b/tests/pstl/inclusive_scan.cpp index 3c8ba3134..942c510b5 100644 --- a/tests/pstl/inclusive_scan.cpp +++ b/tests/pstl/inclusive_scan.cpp @@ -27,18 +27,17 @@ template struct non_default_constructible { public: static auto make(T x){ - non_default_constructible t; t.x = x; + non_default_constructible t; + t.data[0] = x; return t; } T get() const { - return x; + return data[0]; } private: non_default_constructible(){} - T x; - std::array - padding; + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; }; template @@ -145,13 +144,13 @@ void run_all_tests(Policy&& pol) { non_constructible_t::make(3ull), get_non_constructible_bin_op(), ProblemSize); - using massive_non_constructible_t = - non_default_constructible; + /*using massive_non_constructible_t = + non_default_constructible; test_scan(std::execution::par_unseq, get_non_constructible_generator(), massive_non_constructible_t::make(3ull), get_non_constructible_bin_op(), - ProblemSize); + ProblemSize);*/ } BOOST_AUTO_TEST_CASE(par_unseq_empty) { From bf9fa7688816eaec4f09a7b045ab417b1cf2e5dc Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 5 Dec 2024 
04:36:01 +0100 Subject: [PATCH 080/126] [CUDA][SSCP][SMCP] Fix bug in NVIDIA fallback atomic load/stores --- .../generic/hiplike/atomic_builtins.hpp | 5 ++- src/libkernel/sscp/ptx/atomic.cpp | 42 +++++++++++-------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp index d5e3a0d56..f4e5c42b4 100644 --- a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp +++ b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp @@ -53,6 +53,7 @@ __attribute__((always_inline)) HIPSYCL_HIPLIKE_BUILTIN void __acpp_cuda_atomic_store_device_rel_i32(int32_t *ptr, int32_t x) { #if __CUDA_ARCH__ < 700 + __threadfence(); *ptr = x; __threadfence(); #else @@ -68,7 +69,9 @@ HIPSYCL_HIPLIKE_BUILTIN int32_t __acpp_cuda_atomic_load_device_acq_i32(int32_t *ptr) { #if __CUDA_ARCH__ < 700 __threadfence(); - return *ptr; + int32_t res = *ptr; + __threadfence(); + return res; #else int32_t result; asm volatile("ld.acquire.gpu.u32 %0,[%1];" diff --git a/src/libkernel/sscp/ptx/atomic.cpp b/src/libkernel/sscp/ptx/atomic.cpp index 45b9c2dce..50282cd96 100644 --- a/src/libkernel/sscp/ptx/atomic.cpp +++ b/src/libkernel/sscp/ptx/atomic.cpp @@ -421,8 +421,6 @@ unsigned long long __ullAtomicXor_system(unsigned long long *__p, - - // ********************** atomic store *************************** // Unlike the CUDA compilation flow, the __atomic_store and __atomic_load builtin @@ -440,18 +438,32 @@ void mem_fence(__acpp_sscp_memory_scope fence_scope) { } } + +template +T memfenced_load(T* ptr, __acpp_sscp_memory_scope scope) { + mem_fence(scope); + T x = *ptr; + mem_fence(scope); + return x; +} + +template +void memfenced_store(T* ptr, T x, __acpp_sscp_memory_scope scope) { + mem_fence(scope); + *ptr = x; + mem_fence(scope); +} + HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i8( __acpp_sscp_address_space as, 
__acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int8 *ptr, __acpp_int8 x) { - *ptr = x; - mem_fence(scope); + memfenced_store(ptr, x, scope); } HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i16( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int16 *ptr, __acpp_int16 x) { - *ptr = x; - mem_fence(scope); + memfenced_store(ptr, x, scope); } HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i32( @@ -476,15 +488,13 @@ HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i32( } } } - *ptr = x; - mem_fence(scope); + memfenced_store(ptr, x, scope); } HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i64( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int64 *ptr, __acpp_int64 x) { - *ptr = x; - mem_fence(scope); + memfenced_store(ptr, x, scope); } @@ -493,15 +503,13 @@ HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i64( HIPSYCL_SSCP_BUILTIN __acpp_int8 __acpp_sscp_atomic_load_i8( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int8 *ptr) { - mem_fence(scope); - return *ptr; + return memfenced_load(ptr, scope); } HIPSYCL_SSCP_BUILTIN __acpp_int16 __acpp_sscp_atomic_load_i16( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int16 *ptr) { - mem_fence(scope); - return *ptr; + return memfenced_load(ptr, scope); } HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_load_i32( @@ -529,15 +537,13 @@ HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_load_i32( } } } - mem_fence(scope); - return *ptr; + return memfenced_load(ptr, scope); } HIPSYCL_SSCP_BUILTIN __acpp_int64 __acpp_sscp_atomic_load_i64( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int64 *ptr) { - mem_fence(scope); - return *ptr; + return memfenced_load(ptr, scope); } // for internal use only, not part of the public API 
From 34bebb1db2736661eae8830261032713cf864fe1 Mon Sep 17 00:00:00 2001 From: Marco Julian Solanki <173357676+marcosolanki@users.noreply.github.com> Date: Thu, 5 Dec 2024 05:12:57 +0100 Subject: [PATCH 081/126] Set CMake policy CMP0167 to suppress FindBoost removal warning (#1624) * Set CMake policy CMP0167 to suppress FindBoost removal warning * Replace CMake version checks with policy availability checks Co-authored-by: Andrey Alekseenko * Set CMP0167 to OLD to avoid breakage in the Windows & macOS CI tests --------- Co-authored-by: Andrey Alekseenko --- CMakeLists.txt | 60 +++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02b9e4529..8f2c3a79a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,13 @@ cmake_minimum_required(VERSION 3.10) -if(NOT CMAKE_VERSION VERSION_LESS 3.12) + +if(POLICY CMP0074) # Since CMake 3.12 cmake_policy(SET CMP0074 NEW) # Don't complain about using BOOST_ROOT... endif() +if(POLICY CMP0167) # Since CMake 3.30 + # Suppress warning: "Policy CMP0167 is not set: The FindBoost module is removed." + # Set CMP0167 to OLD to avoid breakage in the Windows & macOS CI tests. 
+ cmake_policy(SET CMP0167 OLD) +endif() set(ACPP_VERSION_MAJOR 24) @@ -52,16 +58,16 @@ if(NOT ACPP_VERSION_SUFFIX) RESULT_VARIABLE GIT_LOCAL_CHANGES_RETURN_CODE OUTPUT_STRIP_TRAILING_WHITESPACE ) - - if (GIT_HASH_RETURN_CODE EQUAL 0 AND GIT_BRANCH_RETURN_CODE EQUAL 0 AND + + if (GIT_HASH_RETURN_CODE EQUAL 0 AND GIT_BRANCH_RETURN_CODE EQUAL 0 AND GIT_DATE_RETURN_CODE EQUAL 0 AND GIT_LOCAL_CHANGES_RETURN_CODE EQUAL 0) - + if(NOT "${ACPP_LOCAL_CHANGES}" STREQUAL "") set(DIRTY_STR ".dirty") else() set(DIRTY_STR "") endif() - + set(ACPP_VERSION_SUFFIX "+git.${ACPP_GIT_COMMIT_HASH}.${ACPP_GIT_DATE}.branch.${ACPP_GIT_BRANCH}${DIRTY_STR}") endif() endif() @@ -148,13 +154,13 @@ elseif(ACPP_COMPILER_FEATURE_PROFILE STREQUAL "custom-deprecated") if(DEFINED WITH_ACCELERATED_CPU) set(ACPP_CUSTOM_PROFILE_WITH_ACCELERATED_CPU ${WITH_ACCELERATED_CPU} CACHE INTERNAL "(deprecated) custom profile") endif() - + # Ensure that these argument do not enter cache, otherwise the if(defined) checks above # will no longer work. unset(WITH_SSCP_COMPILER CACHE) unset(WITH_STDPAR_COMPILER CACHE) unset(WITH_ACCELERATED_CPU CACHE) - + set(WITH_SSCP_COMPILER ${ACPP_CUSTOM_PROFILE_WITH_SSCP_COMPILER}) set(WITH_STDPAR_COMPILER ${ACPP_CUSTOM_PROFILE_WITH_STDPAR_COMPILER}) set(WITH_ACCELERATED_CPU ${ACPP_CUSTOM_PROFILE_WITH_ACCELERATED_CPU}) @@ -200,7 +206,7 @@ endif() if(WITH_ROCM_BACKEND) if(NOT ROCM_FOUND) # message(SEND_ERROR "hipcc was not found") - + # User has requested ROCm, but we could not find hipcc. 
# this is not necessarily a reason to abort, # since we only need libhip_hcc, the HIP includes, @@ -298,7 +304,7 @@ if(BUILD_CLANG_PLUGIN) get_filename_component(LLVM_BIN_DIR "${CLANG_EXECUTABLE_PATH}" DIRECTORY) get_filename_component(LLVM_PREFIX_DIR "${LLVM_BIN_DIR}" DIRECTORY) # The path to the internal clang includes is currently required on ROCm - # to let acpp fix a wrong order of system includes (clang's internal + # to let acpp fix a wrong order of system includes (clang's internal # includes are not of high enough priority in the include path search order). # We identify this path as the one containing __clang_cuda_runtime_wrapper.h, # which is a clang-specific header file. @@ -318,7 +324,7 @@ if(BUILD_CLANG_PLUGIN) # Required for newer ROCm versions set(CLANG_INCLUDE_PATH ${FOUND_CLANG_INCLUDE_PATH}/..) endif() - + if(NOT EXISTS ${CLANG_INCLUDE_PATH}) message(SEND_ERROR "clang include path ${CLANG_INCLUDE_PATH} does not exist. Please provide clang's internal include path manually: Find the directory where __clang_cuda_runtime_wrapper.h is. 
Provide this directory for older ROCm versions and the parent directory for newer ones.") endif() @@ -334,7 +340,7 @@ if(BUILD_CLANG_PLUGIN) message(STATUS "AMD clang version: ${ROCM_VERSION_MAJOR}.${ROCM_VERSION_MINOR}.${ROCM_VERSION_PATCH}") endif() endif() - + if(${LLVM_VERSION_MAJOR} LESS 14) if(${WITH_ACCELERATED_CPU} OR ${WITH_SSCP_COMPILER} OR ${WITH_STDPAR_COMPILER}) message(WARNING "clang version too old (${LLVM_VERSION_MAJOR} < 14) to be used with advanced AdaptiveCpp compiler features, disabling WITH_STDPAR_COMPILER, WITH_ACCELERATED_CPU, WITH_SSCP_COMPILER") @@ -345,8 +351,8 @@ if(BUILD_CLANG_PLUGIN) endif() endif() message(STATUS "Using clang include directory: ${CLANG_INCLUDE_PATH}") - -# Check if building on Windows and LLVM_LIBS is set, if not, use LLVM_AVAILABLE_LIBS + +# Check if building on Windows and LLVM_LIBS is set, if not, use LLVM_AVAILABLE_LIBS if(WIN32 AND NOT LLVM_LIBS AND LLVM_AVAILABLE_LIBS) llvm_map_components_to_libnames(LLVM_LIBS analysis core support passes) endif() @@ -362,7 +368,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(ACPP_CONFIG_FILE_PATH "${PROJECT_BINARY_DIR}") -set(ACPP_CONFIG_FILE_GLOBAL_INSTALLATION false CACHE BOOL +set(ACPP_CONFIG_FILE_GLOBAL_INSTALLATION false CACHE BOOL "Whether to install the AdaptiveCpp configuration files into a global directory (typically, /etc/AdaptiveCpp). 
This is generally not recommended.") if(ACPP_CONFIG_FILE_GLOBAL_INSTALLATION) @@ -386,7 +392,7 @@ endif() if(APPLE) set(DEFAULT_OMP_FLAG "-Xclang -fopenmp") - + if(Boost_FIBER_LIBRARY_DEBUG) set(DEFAULT_BOOST_LIBRARIES "${Boost_CONTEXT_LIBRARY_DEBUG} ${Boost_FIBER_LIBRARY_DEBUG} -Wl,-rpath ${Boost_LIBRARY_DIR}") else() @@ -451,7 +457,7 @@ if(WIN32) if(NOT OMP_LINK_LINE) set(OMP_LINK_LINE ${DEFAULT_WIN32_OMP_LINK_LINE} CACHE STRING "Arguments passed to compiler to link OpenMP libraries to SYCL applications") endif() - if(NOT SEQUENTIAL_LINK_LINE) + if(NOT SEQUENTIAL_LINK_LINE) set(SEQUENTIAL_LINK_LINE ${DEFAULT_WIN32_SEQUENTIAL_LINK_LINE} CACHE STRING "Arguments passed to compiler to link host libraries to SYCL applications") endif() elseif(APPLE) @@ -461,7 +467,7 @@ elseif(APPLE) if(NOT OMP_LINK_LINE) set(OMP_LINK_LINE ${DEFAULT_APPLE_OMP_LINK_LINE} CACHE STRING "Arguments passed to compiler to link OpenMP libraries to SYCL applications") endif() - if(NOT SEQUENTIAL_LINK_LINE) + if(NOT SEQUENTIAL_LINK_LINE) set(SEQUENTIAL_LINK_LINE ${DEFAULT_APPLE_SEQUENTIAL_LINK_LINE} CACHE STRING "Arguments passed to compiler to link host libraries to SYCL applications") endif() else() @@ -471,33 +477,33 @@ else() if(NOT OMP_LINK_LINE) set(OMP_LINK_LINE ${DEFAULT_OMP_LINK_LINE} CACHE STRING "Arguments passed to compiler to link OpenMP libraries to SYCL applications") endif() - if(NOT SEQUENTIAL_LINK_LINE) + if(NOT SEQUENTIAL_LINK_LINE) set(SEQUENTIAL_LINK_LINE ${DEFAULT_SEQUENTIAL_LINK_LINE} CACHE STRING "Arguments passed to compiler to link host libraries to SYCL applications") endif() endif() # If no compile flags given, set to default. if(NOT ROCM_CXX_FLAGS) - # clang erroneously sets feature detection flags for + # clang erroneously sets feature detection flags for # __float128 even though it is not supported for CUDA / HIP, # see https://bugs.llvm.org/show_bug.cgi?id=47559. 
set(ROCM_CXX_FLAGS "-isystem $HIPSYCL_PATH/include/AdaptiveCpp/hipSYCL/std/hiplike -isystem ${CLANG_INCLUDE_PATH} -U__FLOAT128__ -U__SIZEOF_FLOAT128__ -I$HIPSYCL_ROCM_PATH/include -I$HIPSYCL_ROCM_PATH/include --rocm-device-lib-path=$HIPSYCL_ROCM_PATH/amdgcn/bitcode --rocm-path=$HIPSYCL_ROCM_PATH -fhip-new-launch-api -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -D__HIP_ROCclr__" CACHE STRING "Arguments passed to compiler to compile SYCL applications with ROCm") endif() -if(NOT CUDA_CXX_FLAGS) - # clang erroneously sets feature detection flags for +if(NOT CUDA_CXX_FLAGS) + # clang erroneously sets feature detection flags for # __float128 even though it is not supported for CUDA / HIP, # see https://bugs.llvm.org/show_bug.cgi?id=47559. set(CUDA_CXX_FLAGS "-U__FLOAT128__ -U__SIZEOF_FLOAT128__ -isystem $HIPSYCL_PATH/include/AdaptiveCpp/hipSYCL/std/hiplike" CACHE STRING "Arguments passed to compiler to compile SYCL applications with CUDA") endif() # always need -D_ENABLE_EXTENDED_ALIGNED_STORAGE to allow correctly aligned local memory on CPU -if(NOT OMP_CXX_FLAGS) +if(NOT OMP_CXX_FLAGS) set(OMP_CXX_FLAGS "-I${Boost_INCLUDE_DIR} ${DEFAULT_OMP_FLAG} -D_ENABLE_EXTENDED_ALIGNED_STORAGE" CACHE STRING "Arguments passed to compiler to compile SYCL applications with OpenMP") endif() -if(NOT SEQUENTIAL_CXX_FLAGS) +if(NOT SEQUENTIAL_CXX_FLAGS) set(SEQUENTIAL_CXX_FLAGS "-I${Boost_INCLUDE_DIR} -D_ENABLE_EXTENDED_ALIGNED_STORAGE" CACHE STRING "Arguments passed to compiler to compile SYCL applications on host") endif() @@ -511,13 +517,13 @@ set(DEFAULT_GPU_ARCH "" CACHE STRING "(Deprecated, use DEFAULT_TARGETS instead) set(DEFAULT_TARGETS "" CACHE STRING "Default targets to compile for") if(NOT DEFAULT_TARGETS) - if(DEFAULT_PLATFORM) + if(DEFAULT_PLATFORM) message(DEPRECATION "DEFAULT_PLATFORM is deprecated; use DEFAULT_TARGETS instead.") - + if(DEFAULT_PLATFORM STREQUAL "cpu") set(DEFAULT_TARGETS "omp") endif() - + if(DEFAULT_GPU_ARCH) message(DEPRECATION 
"DEFAULT_GPU_ARCH is deprecated; use DEFAULT_TARGETS instead.") @@ -528,7 +534,7 @@ if(NOT DEFAULT_TARGETS) else() message(SEND_ERROR "Invalid value for DEFAULT_PLATFORM: \"${DEFAULT_PLATFORM}\". When DEFAULT_GPU_ARCH is specified, only \"cuda\" and \"rocm\" are supported.") endif() - + endif() elseif(DEFAULT_GPU_ARCH) message(DEPRECATION "DEFAULT_GPU_ARCH is deprecated; use DEFAULT_TARGETS instead.") From 026e7219fdbfa2587100a6db86aa97b4047ad330 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 5 Dec 2024 17:38:01 +0100 Subject: [PATCH 082/126] Align definition of SYCL_LANGUAGE_VERSION with updated wording in spec (#1630) --- include/hipSYCL/sycl/sycl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hipSYCL/sycl/sycl.hpp b/include/hipSYCL/sycl/sycl.hpp index 10a95c28a..3ff1e47b1 100644 --- a/include/hipSYCL/sycl/sycl.hpp +++ b/include/hipSYCL/sycl/sycl.hpp @@ -23,8 +23,8 @@ #undef SYCL_LANGUAGE_VERSION #endif -#define CL_SYCL_LANGUAGE_VERSION 202003 -#define SYCL_LANGUAGE_VERSION 202003 +#define CL_SYCL_LANGUAGE_VERSION 202012L +#define SYCL_LANGUAGE_VERSION 202012L #define SYCL_FEATURE_SET_FULL #include "hipSYCL/glue/persistent_runtime.hpp" From e8b8c3aa2b1420b1b5f2d277caae82ee1d8f7242 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 6 Dec 2024 05:56:53 +0100 Subject: [PATCH 083/126] Expose different OpenCL platforms as different SYCL platforms --- .../runtime/cuda/cuda_hardware_manager.hpp | 4 ++ include/hipSYCL/runtime/hardware.hpp | 4 ++ .../runtime/hip/hip_hardware_manager.hpp | 3 ++ .../runtime/ocl/ocl_hardware_manager.hpp | 3 ++ .../runtime/omp/omp_hardware_manager.hpp | 3 ++ .../runtime/ze/ze_hardware_manager.hpp | 3 ++ include/hipSYCL/sycl/context.hpp | 36 ++++++++------ include/hipSYCL/sycl/platform.hpp | 48 ++++++++++++++----- src/runtime/cuda/cuda_hardware_manager.cpp | 7 +++ src/runtime/hip/hip_hardware_manager.cpp | 7 +++ src/runtime/ocl/ocl_hardware_manager.cpp | 8 ++++ 
src/runtime/omp/omp_hardware_manager.cpp | 10 ++++ src/runtime/ze/ze_hardware_manager.cpp | 9 ++++ 13 files changed, 118 insertions(+), 27 deletions(-) diff --git a/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp b/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp index bc58a7d3c..4086609b9 100644 --- a/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp +++ b/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp @@ -52,6 +52,8 @@ class cuda_hardware_context : public hardware_context virtual std::string get_driver_version() const override; virtual std::string get_profile() const override; + virtual std::size_t get_platform_index() const override; + virtual ~cuda_hardware_context(); cuda_allocator* get_allocator() const; @@ -74,6 +76,8 @@ class cuda_hardware_manager : public backend_hardware_manager virtual hardware_context *get_device(std::size_t index) override; virtual device_id get_device_id(std::size_t index) const override; + virtual std::size_t get_num_platforms() const override; + virtual ~cuda_hardware_manager() {} private: diff --git a/include/hipSYCL/runtime/hardware.hpp b/include/hipSYCL/runtime/hardware.hpp index 654445fa2..13083fe8b 100644 --- a/include/hipSYCL/runtime/hardware.hpp +++ b/include/hipSYCL/runtime/hardware.hpp @@ -127,6 +127,8 @@ class hardware_context virtual std::string get_driver_version() const = 0; virtual std::string get_profile() const = 0; + + virtual std::size_t get_platform_index() const= 0; virtual ~hardware_context(){} }; @@ -135,6 +137,8 @@ class backend_hardware_manager { public: virtual std::size_t get_num_devices() const = 0; + virtual std::size_t get_num_platforms() const = 0; + virtual hardware_context *get_device(std::size_t index) = 0; virtual device_id get_device_id(std::size_t index) const = 0; diff --git a/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp b/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp index 419f3dee9..e0ba8fbf8 100644 --- 
a/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp +++ b/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp @@ -53,6 +53,8 @@ class hip_hardware_context : public hardware_context virtual std::string get_driver_version() const override; virtual std::string get_profile() const override; + virtual std::size_t get_platform_index() const override; + virtual ~hip_hardware_context() {} hip_allocator* get_allocator() const; @@ -75,6 +77,7 @@ class hip_hardware_manager : public backend_hardware_manager virtual std::size_t get_num_devices() const override; virtual hardware_context *get_device(std::size_t index) override; virtual device_id get_device_id(std::size_t index) const override; + virtual std::size_t get_num_platforms() const override; virtual ~hip_hardware_manager() {} diff --git a/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp b/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp index 156ae1a10..c204f0990 100644 --- a/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp +++ b/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp @@ -57,6 +57,8 @@ class ocl_hardware_context : public hardware_context virtual ~ocl_hardware_context(); + virtual std::size_t get_platform_index() const override; + ocl_allocator* get_allocator(); ocl_usm* get_usm_provider(); @@ -86,6 +88,7 @@ class ocl_hardware_manager : public backend_hardware_manager virtual std::size_t get_num_devices() const override; virtual hardware_context *get_device(std::size_t index) override; virtual device_id get_device_id(std::size_t index) const override; + virtual std::size_t get_num_platforms() const override; virtual ~ocl_hardware_manager() {} diff --git a/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp b/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp index 74eac0e05..e38547799 100644 --- a/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp +++ b/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp @@ -40,6 +40,8 @@ class omp_hardware_context : public hardware_context virtual 
std::string get_driver_version() const override; virtual std::string get_profile() const override; + virtual std::size_t get_platform_index() const override; + virtual ~omp_hardware_context() {} }; @@ -49,6 +51,7 @@ class omp_hardware_manager : public backend_hardware_manager virtual std::size_t get_num_devices() const override; virtual hardware_context *get_device(std::size_t index) override; virtual device_id get_device_id(std::size_t index) const override; + virtual std::size_t get_num_platforms() const override; virtual ~omp_hardware_manager(){} private: diff --git a/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp b/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp index 0b411c4ad..88a21e711 100644 --- a/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp +++ b/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp @@ -88,6 +88,8 @@ class ze_hardware_context : public hardware_context virtual std::string get_driver_version() const override; virtual std::string get_profile() const override; + virtual std::size_t get_platform_index() const override; + virtual ~ze_hardware_context(); ze_driver_handle_t get_ze_driver() const @@ -119,6 +121,7 @@ class ze_hardware_manager : public backend_hardware_manager virtual std::size_t get_num_devices() const override; virtual hardware_context *get_device(std::size_t index) override; virtual device_id get_device_id(std::size_t index) const override; + virtual std::size_t get_num_platforms() const override; virtual ~ze_hardware_manager() {} diff --git a/include/hipSYCL/sycl/context.hpp b/include/hipSYCL/sycl/context.hpp index 0ef0987c3..5da23dd55 100644 --- a/include/hipSYCL/sycl/context.hpp +++ b/include/hipSYCL/sycl/context.hpp @@ -128,29 +128,35 @@ class context } platform get_platform() const { - bool found_device_backend = false; - rt::backend_id last_backend; + bool found_device_platform = false; + rt::platform_id last_platform; this->_impl->devices.for_each_backend([&](rt::backend_id b) { - if (b != 
detail::get_host_device().get_backend()) { - if (found_device_backend) { - // We already have a device backend - HIPSYCL_DEBUG_WARNING - << "context: get_platform() was called but this context spans " - "multiple backends/platforms. Only returning last platform" - << std::endl; + rt::backend* backend = this->_impl->requires_runtime.get()->backends().get(b); + + for (std::size_t platform_index = 0; + platform_index < backend->get_hardware_manager()->get_num_platforms(); + ++platform_index) { + if (b != detail::get_host_device().get_backend()) { + if (found_device_platform) { + // We already have a device backend + HIPSYCL_DEBUG_WARNING + << "context: get_platform() was called but this context spans " + "multiple backends/platforms. Only returning last platform" + << std::endl; + } + + last_platform = rt::platform_id{b, platform_index}; + found_device_platform = true; } - - last_backend = b; - found_device_backend = true; } }); - if (!found_device_backend) { - last_backend = detail::get_host_device().get_backend(); + if (!found_device_platform) { + last_platform = rt::platform_id{detail::get_host_device().get_backend(), 0}; } - return platform{last_backend}; + return platform{last_platform}; } vector_class get_devices() const { diff --git a/include/hipSYCL/sycl/platform.hpp b/include/hipSYCL/sycl/platform.hpp index 153213e2b..e6a40c79f 100644 --- a/include/hipSYCL/sycl/platform.hpp +++ b/include/hipSYCL/sycl/platform.hpp @@ -12,6 +12,7 @@ #define HIPSYCL_PLATFORM_HPP #include +#include #include "hipSYCL/runtime/application.hpp" #include "hipSYCL/runtime/backend.hpp" @@ -22,6 +23,7 @@ #include "info/info.hpp" #include "version.hpp" + namespace hipsycl { namespace sycl { @@ -32,13 +34,25 @@ class platform { public: platform() : _platform{detail::get_host_device().get_backend(), 0} {} - platform(rt::backend_id backend) - : _platform{backend, 0} {} + platform(rt::platform_id platform) + : _platform{platform} {} + + platform(rt::backend_id backend, std::size_t 
platform_index) + : _platform{backend, platform_index} {} template explicit platform(const DeviceSelector &deviceSelector) { auto dev = detail::select_devices(deviceSelector)[0]; - this->_platform = rt::platform_id{dev._device_id}; + + rt::backend *b = + _requires_runtime.get()->backends().get(dev.get_backend()); + std::size_t platform_index = + b->get_hardware_manager() + ->get_device(dev.AdaptiveCpp_device_id().get_id()) + ->get_platform_index(); + + this->_platform = + rt::platform_id{dev.get_backend(), static_cast(platform_index)}; } @@ -54,12 +68,15 @@ class platform { bool is_gpu = b->get_hardware_manager()->get_device(dev)->is_gpu(); bool include_device = false; - if (type == info::device_type::all || - (type == info::device_type::accelerator && is_gpu) || - (type == info::device_type::gpu && is_gpu) || - (type == info::device_type::host && is_cpu) || - (type == info::device_type::cpu && is_cpu)) { - include_device = true; + if (b->get_hardware_manager()->get_device(dev)->get_platform_index() == + _platform.get_platform()) { + if (type == info::device_type::all || + (type == info::device_type::accelerator && is_gpu) || + (type == info::device_type::gpu && is_gpu) || + (type == info::device_type::host && is_cpu) || + (type == info::device_type::cpu && is_cpu)) { + include_device = true; + } } if (include_device) @@ -102,7 +119,10 @@ class platform { rt::runtime_keep_alive_token requires_runtime; requires_runtime.get()->backends().for_each_backend([&](rt::backend *b) { - result.push_back(platform{b->get_unique_backend_id()}); + for (std::size_t i = 0; + i < b->get_hardware_manager()->get_num_platforms(); ++i) { + result.push_back(platform{b->get_unique_backend_id(), i}); + } }); return result; @@ -148,7 +168,10 @@ HIPSYCL_SPECIALIZE_GET_INFO(platform, version) HIPSYCL_SPECIALIZE_GET_INFO(platform, name) { rt::backend_id b = _platform.get_backend(); - return _requires_runtime.get()->backends().get(b)->get_name(); + std::string platform_name = + 
_requires_runtime.get()->backends().get(b)->get_name(); + platform_name += + " (platform " + std::to_string(_platform.get_platform()) + ")"; } HIPSYCL_SPECIALIZE_GET_INFO(platform, vendor) @@ -162,7 +185,8 @@ HIPSYCL_SPECIALIZE_GET_INFO(platform, extensions) } inline platform device::get_platform() const { - return platform{_device_id.get_backend()}; + return platform{_device_id.get_backend(), + static_cast(get_rt_device()->get_platform_index())}; } }// namespace sycl diff --git a/src/runtime/cuda/cuda_hardware_manager.cpp b/src/runtime/cuda/cuda_hardware_manager.cpp index e4e9819fa..83a073c87 100644 --- a/src/runtime/cuda/cuda_hardware_manager.cpp +++ b/src/runtime/cuda/cuda_hardware_manager.cpp @@ -85,6 +85,13 @@ device_id cuda_hardware_manager::get_device_id(std::size_t index) const { static_cast(index)}; } +std::size_t cuda_hardware_manager::get_num_platforms() const { + return 1; +} + +std::size_t cuda_hardware_context::get_platform_index() const { + return 0; +} cuda_hardware_context::cuda_hardware_context(int dev) : _dev{dev} { diff --git a/src/runtime/hip/hip_hardware_manager.cpp b/src/runtime/hip/hip_hardware_manager.cpp index a1839229b..7ced6676e 100644 --- a/src/runtime/hip/hip_hardware_manager.cpp +++ b/src/runtime/hip/hip_hardware_manager.cpp @@ -108,6 +108,13 @@ device_id hip_hardware_manager::get_device_id(std::size_t index) const { static_cast(index)}; } +std::size_t hip_hardware_manager::get_num_platforms() const { + return 1; +} + +std::size_t hip_hardware_context::get_platform_index() const { + return 0; +} hip_hardware_context::hip_hardware_context(int dev) : _dev{dev} { _properties = std::make_unique(); diff --git a/src/runtime/ocl/ocl_hardware_manager.cpp b/src/runtime/ocl/ocl_hardware_manager.cpp index 8747063a1..3cf9624b3 100644 --- a/src/runtime/ocl/ocl_hardware_manager.cpp +++ b/src/runtime/ocl/ocl_hardware_manager.cpp @@ -574,6 +574,14 @@ void ocl_hardware_context::init_allocator(ocl_hardware_manager *mgr) { _alloc = ocl_allocator{dev, 
_usm_provider.get()}; } +std::size_t ocl_hardware_context::get_platform_index() const { + return static_cast(_platform_id); +} + +std::size_t ocl_hardware_manager::get_num_platforms() const { + return _platforms.size(); +} + ocl_hardware_manager::ocl_hardware_manager() : _hw_platform{hardware_platform::ocl} { const auto visibility_mask = diff --git a/src/runtime/omp/omp_hardware_manager.cpp b/src/runtime/omp/omp_hardware_manager.cpp index d36aad6c7..7eb0f9fba 100644 --- a/src/runtime/omp/omp_hardware_manager.cpp +++ b/src/runtime/omp/omp_hardware_manager.cpp @@ -294,6 +294,16 @@ std::string omp_hardware_context::get_profile() const { return "FULL_PROFILE"; } +std::size_t omp_hardware_context::get_platform_index() const { + return 0; +} + +std::size_t omp_hardware_manager::get_num_platforms() const { + return 1; +} + + + std::size_t omp_hardware_manager::get_num_devices() const { return 1; } diff --git a/src/runtime/ze/ze_hardware_manager.cpp b/src/runtime/ze/ze_hardware_manager.cpp index 9fe3aaada..8fb4e9971 100644 --- a/src/runtime/ze/ze_hardware_manager.cpp +++ b/src/runtime/ze/ze_hardware_manager.cpp @@ -512,6 +512,15 @@ uint32_t ze_hardware_context::get_ze_global_memory_ordinal() const { return result; } +std::size_t ze_hardware_context::get_platform_index() const { + return 0; +} + +std::size_t ze_hardware_manager::get_num_platforms() const { + return 1; +} + + ze_hardware_manager::ze_hardware_manager() { if (has_device_visibility_mask( From bf0b12c72f275b7522f7ae2490e8e6f07f03a969 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 6 Dec 2024 06:12:47 +0100 Subject: [PATCH 084/126] Provide some basic information about platform in acpp-info --- src/tools/acpp-info/acpp-info.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/tools/acpp-info/acpp-info.cpp b/src/tools/acpp-info/acpp-info.cpp index a469af8b1..02d2f8fd3 100644 --- a/src/tools/acpp-info/acpp-info.cpp +++ b/src/tools/acpp-info/acpp-info.cpp @@ -70,6 +70,11 @@ void 
list_device_details(rt::device_id dev, rt::backend *b, std::cout << " General device information:" << std::endl; print_info("Name", hw->get_device_name(), 2); print_info("Backend", b->get_name(), 2); + print_info("Platform", + "Backend " + + std::to_string(static_cast(b->get_unique_backend_id())) + + " / Platform " + std::to_string(hw->get_platform_index()), + 2); print_info("Vendor", hw->get_vendor_name(), 2); print_info("Arch", hw->get_device_arch(), 2); print_info("Driver version", hw->get_driver_version(), 2); From 7bf15af14648de5636ca8ca633386eaf85283bc9 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 6 Dec 2024 06:37:33 +0100 Subject: [PATCH 085/126] Add acpp namespace alias --- include/hipSYCL/algorithms/algorithm.hpp | 2 ++ include/hipSYCL/algorithms/numeric.hpp | 1 + .../hipSYCL/sycl/detail/namespace_compat.hpp | 19 +++++++++++++++++++ include/hipSYCL/sycl/sycl.hpp | 1 + 4 files changed, 23 insertions(+) create mode 100644 include/hipSYCL/sycl/detail/namespace_compat.hpp diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index ce9d6b884..00f1ad3cb 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ b/include/hipSYCL/algorithms/algorithm.hpp @@ -19,6 +19,7 @@ #include "hipSYCL/sycl/libkernel/atomic_builtins.hpp" #include "hipSYCL/sycl/libkernel/memory.hpp" #include "hipSYCL/sycl/libkernel/functional.hpp" +#include "hipSYCL/sycl/detail/namespace_compat.hpp" #include "hipSYCL/sycl/event.hpp" #include "hipSYCL/sycl/queue.hpp" #include "merge/merge.hpp" @@ -28,6 +29,7 @@ #include "hipSYCL/algorithms/sort/bitonic_sort.hpp" #include "hipSYCL/algorithms/merge/merge.hpp" + namespace hipsycl::algorithms { namespace detail { diff --git a/include/hipSYCL/algorithms/numeric.hpp b/include/hipSYCL/algorithms/numeric.hpp index bf3bbbf3d..cf5b1da33 100644 --- a/include/hipSYCL/algorithms/numeric.hpp +++ b/include/hipSYCL/algorithms/numeric.hpp @@ -21,6 +21,7 @@ #include "hipSYCL/sycl/libkernel/functional.hpp" 
#include "hipSYCL/sycl/event.hpp" #include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/sycl/detail/namespace_compat.hpp" #include "hipSYCL/algorithms/reduction/reduction_descriptor.hpp" #include "hipSYCL/algorithms/reduction/reduction_engine.hpp" #include "hipSYCL/algorithms/util/memory_streaming.hpp" diff --git a/include/hipSYCL/sycl/detail/namespace_compat.hpp b/include/hipSYCL/sycl/detail/namespace_compat.hpp new file mode 100644 index 000000000..3ff1263ee --- /dev/null +++ b/include/hipSYCL/sycl/detail/namespace_compat.hpp @@ -0,0 +1,19 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_NAMESPACE_COMPAT +#define ACPP_NAMESPACE_COMPAT + +namespace acpp { + using namespace hipsycl; +} + +#endif diff --git a/include/hipSYCL/sycl/sycl.hpp b/include/hipSYCL/sycl/sycl.hpp index 3ff1e47b1..2827f65ec 100644 --- a/include/hipSYCL/sycl/sycl.hpp +++ b/include/hipSYCL/sycl/sycl.hpp @@ -78,6 +78,7 @@ #include "buffer_explicit_behavior.hpp" #include "specialized.hpp" #include "jit.hpp" +#include "detail/namespace_compat.hpp" // Support SYCL_EXTERNAL for SSCP - we cannot have SYCL_EXTERNAL if accelerated CPU // is active at the same time :( From 5461aa20a9636b699fcceca0115f62c80fa769d3 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 7 Dec 2024 01:56:08 +0100 Subject: [PATCH 086/126] Fix missing reurn for platform::info::name Co-authored-by: Andrey Alekseenko --- include/hipSYCL/sycl/platform.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/hipSYCL/sycl/platform.hpp b/include/hipSYCL/sycl/platform.hpp index e6a40c79f..4a61302a1 100644 --- a/include/hipSYCL/sycl/platform.hpp +++ b/include/hipSYCL/sycl/platform.hpp @@ -172,6 +172,7 @@ 
HIPSYCL_SPECIALIZE_GET_INFO(platform, name) _requires_runtime.get()->backends().get(b)->get_name(); platform_name += " (platform " + std::to_string(_platform.get_platform()) + ")"; + return platform_name; } HIPSYCL_SPECIALIZE_GET_INFO(platform, vendor) From c4eefb90d9c3b4a1f77c978be8414384df30e208 Mon Sep 17 00:00:00 2001 From: ferdymercury Date: Sat, 7 Dec 2024 02:21:46 +0100 Subject: [PATCH 087/126] [cmake] clarify what variable needs to be defined by hand (#1629) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f2c3a79a..c1b4e4a10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -326,7 +326,7 @@ if(BUILD_CLANG_PLUGIN) endif() if(NOT EXISTS ${CLANG_INCLUDE_PATH}) - message(SEND_ERROR "clang include path ${CLANG_INCLUDE_PATH} does not exist. Please provide clang's internal include path manually: Find the directory where __clang_cuda_runtime_wrapper.h is. Provide this directory for older ROCm versions and the parent directory for newer ones.") + message(SEND_ERROR "CLANG_INCLUDE_PATH ${CLANG_INCLUDE_PATH} does not exist. Please provide clang's internal include path manually: Find the directory where __clang_cuda_runtime_wrapper.h is. 
Provide this directory for older ROCm versions and the parent directory for newer ones.") endif() if(WITH_ROCM_BACKEND) execute_process(COMMAND ${CLANG_EXECUTABLE_PATH} "--version" From 2f47a114e540a5a6d809af80fe1b23f36fc890f1 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 7 Dec 2024 02:22:15 +0100 Subject: [PATCH 088/126] [NFC][doc] Fix incorrect environment variables for IR dump feature (#1627) --- doc/env_variables.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/env_variables.md b/doc/env_variables.md index 618bc553e..e7fef2cf5 100644 --- a/doc/env_variables.md +++ b/doc/env_variables.md @@ -48,15 +48,15 @@ These environment variables take the shape `ACPP_S2_DUMP_IR_` for various Within one application run, AdaptiveCpp appends IR dumps to the dump file. When a new application run results in new dumps being generated to the same file, the file will be truncated first. Available stages for dumping: -* `ACPP_S2_IR_DUMP_INPUT` - dumps the raw, unoptimized generic input LLVM IR -* `ACPP_S2_IR_DUMP_INITIAL_OUTLINING` - After initial kernel outlining -* `ACPP_S2_IR_DUMP_SPECIALIZATION` - After applying specializations to the kernel -* `ACPP_S2_IR_DUMP_REFLECTION` - After processing JIT-time reflection queries -* `ACPP_S2_IR_DUMP_JIT_OPTIMIZATIONS` - After processing optimizations that rely on JIT-time information` -* `ACPP_S2_IR_DUMP_BACKEND_FLAVORING` - After applying the "backend flavor", i.e. turning generic LLVM IR into IR that targets a specific backend. -* `ACPP_S2_IR_DUMP_FULL_OPTIMIZATIONS` - After running the full LLVM optimization pipeline on the code. -* `ACPP_S2_IR_DUMP_FINAL` - Final state of the LLVM IR before handing it off to lowering it to backend-specific formats (e.g. PTX, amdgcn ISA, SPIR-V). -* `ACPP_S2_IR_DUMP_ALL` - Dump all stages. 
+* `ACPP_S2_DUMP_IR_INPUT` - dumps the raw, unoptimized generic input LLVM IR +* `ACPP_S2_DUMP_IR_INITIAL_OUTLINING` - After initial kernel outlining +* `ACPP_S2_DUMP_IR_SPECIALIZATION` - After applying specializations to the kernel +* `ACPP_S2_DUMP_IR_REFLECTION` - After processing JIT-time reflection queries +* `ACPP_S2_DUMP_IR_JIT_OPTIMIZATIONS` - After processing optimizations that rely on JIT-time information +* `ACPP_S2_DUMP_IR_BACKEND_FLAVORING` - After applying the "backend flavor", i.e. turning generic LLVM IR into IR that targets a specific backend. +* `ACPP_S2_DUMP_IR_FULL_OPTIMIZATIONS` - After running the full LLVM optimization pipeline on the code. +* `ACPP_S2_DUMP_IR_FINAL` - Final state of the LLVM IR before handing it off to lowering it to backend-specific formats (e.g. PTX, amdgcn ISA, SPIR-V). +* `ACPP_S2_DUMP_IR_ALL` - Dump all stages. A dump section for a stage in the dump file will take the following form: ``` From a556ce6ed5dc99afb622c5f99a37bf9394bf2494 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Sat, 7 Dec 2024 01:43:19 +0000 Subject: [PATCH 089/126] replace c headers by std c++ headers --- src/libkernel/sscp/host/math.cpp | 2 +- src/libkernel/sscp/host/print.cpp | 2 +- src/libkernel/sscp/host/relational.cpp | 2 +- tests/sycl/group_functions/group_functions.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/libkernel/sscp/host/math.cpp b/src/libkernel/sscp/host/math.cpp index 1881230e8..9ee91df92 100644 --- a/src/libkernel/sscp/host/math.cpp +++ b/src/libkernel/sscp/host/math.cpp @@ -8,7 +8,7 @@ * See file LICENSE in the project root for full license details. 
*/ // SPDX-License-Identifier: BSD-2-Clause -#include +#include #include "hipSYCL/sycl/libkernel/sscp/builtins/builtin_config.hpp" #include "hipSYCL/sycl/libkernel/sscp/builtins/math.hpp" diff --git a/src/libkernel/sscp/host/print.cpp b/src/libkernel/sscp/host/print.cpp index decb8890d..7801916f3 100644 --- a/src/libkernel/sscp/host/print.cpp +++ b/src/libkernel/sscp/host/print.cpp @@ -10,7 +10,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/sycl/libkernel/sscp/builtins/print.hpp" -#include +#include void __acpp_sscp_print(const char* msg) { puts(msg); diff --git a/src/libkernel/sscp/host/relational.cpp b/src/libkernel/sscp/host/relational.cpp index 4dd52b310..42e8e0c2c 100644 --- a/src/libkernel/sscp/host/relational.cpp +++ b/src/libkernel/sscp/host/relational.cpp @@ -10,7 +10,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/sycl/libkernel/sscp/builtins/relational.hpp" -#include +#include #define HIPSYCL_SSCP_MAP_HOST_REL_BUILTIN(name) \ HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_##name##_f32(float x) { \ diff --git a/tests/sycl/group_functions/group_functions.hpp b/tests/sycl/group_functions/group_functions.hpp index 93412dfa2..27ce74f5b 100644 --- a/tests/sycl/group_functions/group_functions.hpp +++ b/tests/sycl/group_functions/group_functions.hpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include From 993cd28198b91547bd82918aedf8e164243b5dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Sat, 7 Dec 2024 01:51:48 +0000 Subject: [PATCH 090/126] fix build --- src/libkernel/sscp/host/relational.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libkernel/sscp/host/relational.cpp b/src/libkernel/sscp/host/relational.cpp index 42e8e0c2c..1b4cc3908 100644 --- a/src/libkernel/sscp/host/relational.cpp +++ b/src/libkernel/sscp/host/relational.cpp @@ -29,8 +29,8 @@ HIPSYCL_SSCP_MAP_HOST_REL_BUILTIN(isfinite) 
HIPSYCL_SSCP_MAP_HOST_REL_BUILTIN(isnormal) HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_signbit_f32(float x) { - return signbit(x); + return std::signbit(x); } HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_signbit_f64(double x) { - return signbit(x); + return std::signbit(x); } From 835b04359feee0a57165a8e2c92810049ec5d9cd Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 7 Dec 2024 03:47:00 +0100 Subject: [PATCH 091/126] default selector: Prefer OpenCL CPU over OpenMP CPU device --- include/hipSYCL/sycl/device_selector.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/hipSYCL/sycl/device_selector.hpp b/include/hipSYCL/sycl/device_selector.hpp index c6a37d299..19b993a8d 100644 --- a/include/hipSYCL/sycl/device_selector.hpp +++ b/include/hipSYCL/sycl/device_selector.hpp @@ -108,7 +108,14 @@ inline int select_default(const device& dev) { } else if(dev.is_cpu()) { // Prefer CPU over GPUs that don't have compiled kernels // and cannot run kernels. - return 1; + + // Prefer non-OpenMP CPU device since the OpenMP backend cannot be disabled, + // so there would be no way to select e.g. an OpenCL CPU device + // using ACPP_VISIBILITY_MASK otherwise. 
+ if(dev.get_backend() != sycl::backend::omp) + return 1; + else + return 0; } else { // Never select GPUs without compiled kernels return -1; From d9c4094aec7f5c317fd00fb74f8449fe1d62703c Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 7 Dec 2024 03:53:56 +0100 Subject: [PATCH 092/126] Remove outdated installation scripts --- install/scripts/README.md | 90 ------ .../add-hipsycl-repo/archlinux-rolling.sh | 12 - install/scripts/add-hipsycl-repo/centos-7.sh | 6 - .../scripts/add-hipsycl-repo/ubuntu-18.04.sh | 7 - .../scripts/add-hipsycl-repo/ubuntu-20.04.sh | 5 - .../base-definitions/archlinux-rolling.def | 9 - install/scripts/base-definitions/centos-7.def | 22 -- .../scripts/base-definitions/ubuntu-18.04.def | 16 - install/scripts/hipsycl-archlinux-rolling.def | 8 - install/scripts/hipsycl-centos-7.def | 10 - install/scripts/hipsycl-minimal-install.sh | 24 -- install/scripts/hipsycl-ubuntu-18.04.def | 8 - install/scripts/install-base-spack.sh | 25 -- install/scripts/install-cuda.sh | 10 - install/scripts/install-hipsycl.sh | 39 --- install/scripts/install-llvm.sh | 71 ----- install/scripts/install-rocm.sh | 55 ---- install/scripts/packaging/common/init.sh | 79 ----- .../packaging/make-archlinux-cuda-pkg.sh | 33 -- .../scripts/packaging/make-archlinux-pkg.sh | 239 --------------- .../scripts/packaging/make-centos-7-pkg.sh | 271 ----------------- .../scripts/packaging/make-centos-8-pkg.sh | 282 ------------------ .../scripts/packaging/make-centos-cuda-pkg.sh | 43 --- .../scripts/packaging/make-ubuntu-cuda-pkg.sh | 25 -- install/scripts/packaging/make-ubuntu-pkg.sh | 157 ---------- install/scripts/rebuild-images.sh | 71 ----- install/scripts/spack-install/boost.sh | 28 -- install/scripts/spack-install/cuda.sh | 1 - install/scripts/spack-install/hipsycl.sh | 66 ---- install/scripts/spack-install/llvm.sh | 34 --- install/scripts/spack-install/rocm.sh | 35 --- .../scripts/spack-install/spack-syclcc.json | 19 -- 32 files changed, 1800 deletions(-) delete mode 100644 
install/scripts/README.md delete mode 100644 install/scripts/add-hipsycl-repo/archlinux-rolling.sh delete mode 100644 install/scripts/add-hipsycl-repo/centos-7.sh delete mode 100644 install/scripts/add-hipsycl-repo/ubuntu-18.04.sh delete mode 100644 install/scripts/add-hipsycl-repo/ubuntu-20.04.sh delete mode 100644 install/scripts/base-definitions/archlinux-rolling.def delete mode 100644 install/scripts/base-definitions/centos-7.def delete mode 100644 install/scripts/base-definitions/ubuntu-18.04.def delete mode 100644 install/scripts/hipsycl-archlinux-rolling.def delete mode 100644 install/scripts/hipsycl-centos-7.def delete mode 100644 install/scripts/hipsycl-minimal-install.sh delete mode 100644 install/scripts/hipsycl-ubuntu-18.04.def delete mode 100644 install/scripts/install-base-spack.sh delete mode 100644 install/scripts/install-cuda.sh delete mode 100644 install/scripts/install-hipsycl.sh delete mode 100644 install/scripts/install-llvm.sh delete mode 100644 install/scripts/install-rocm.sh delete mode 100644 install/scripts/packaging/common/init.sh delete mode 100644 install/scripts/packaging/make-archlinux-cuda-pkg.sh delete mode 100644 install/scripts/packaging/make-archlinux-pkg.sh delete mode 100644 install/scripts/packaging/make-centos-7-pkg.sh delete mode 100644 install/scripts/packaging/make-centos-8-pkg.sh delete mode 100644 install/scripts/packaging/make-centos-cuda-pkg.sh delete mode 100644 install/scripts/packaging/make-ubuntu-cuda-pkg.sh delete mode 100644 install/scripts/packaging/make-ubuntu-pkg.sh delete mode 100644 install/scripts/rebuild-images.sh delete mode 100644 install/scripts/spack-install/boost.sh delete mode 120000 install/scripts/spack-install/cuda.sh delete mode 100644 install/scripts/spack-install/hipsycl.sh delete mode 100644 install/scripts/spack-install/llvm.sh delete mode 100644 install/scripts/spack-install/rocm.sh delete mode 100644 install/scripts/spack-install/spack-syclcc.json diff --git a/install/scripts/README.md 
b/install/scripts/README.md deleted file mode 100644 index a1d86d4fd..000000000 --- a/install/scripts/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# hipSYCL installation and packaging scripts - -We provide -* Scripts to install hipSYCL and required LLVM, ROCm and CUDA stacks -* Repositories for all supported distributions -* Singularity definition files which allow to create singularity container images with hipSYCL -* Pre-built singularity containers -* Scripts to create binary packages of the entire stack for several distributions. - -Currently, we support -* Ubuntu 18.04 -* Ubuntu 20.04 -* CentOS 7 -* Arch Linux - -## Installing from repositories -Installing using the repositories is beneficial because the hipSYCL installation can be kept up to date with regular system updates. We provide stable packages subject to more rigorous testing and nightly packages built from the current development head. - -We provide the following packages in both versions: - -Base packages: -* `hipSYCL-base<-nightly>` -* `hipSYCL-base-rocm<-nightly>` -HipSYCL packages: -* `hipSYCL-omp<-nightly>` -* `hipSYCL-omp-cuda<-nightly>` -* `hipSYCL-omp-rocm<-nightly>` -* `hipSYCL-omp-rocm-cuda<-nightly>` -Two meta-packages in order to keep consistent with the previous packages: -* `hipSYCL-full<-nightly>` -> `hipSYCL-omp-rocm-cuda<-nightly>` -* `hipSYCL<-nightly>` -> `hipSYCL-omp-rocm-cuda<-nightly>` - -We require some additional software repos to be enabled (for example, `release-scl` and `epel` for centos 7 ). To make adding these easier, we provide scripts in the `install/scripts/add-hipsycl-repo` for all supported distributions that handles adding these repositories, as well as adding the hipSYCL repo. - -## Installing by script -Note that the installation scripts may require the installation of some packages, depending on your distributions. We recommend first looking at the singularity definition files `*.def` for your distribution and installing everything that is installed there. 
Afterwards, run - -* `sudo sh install-llvm.sh` - basic LLVM/clang stack required by hipSYCL -* `sudo sh install-cuda.sh` - downloads and installs a compatible CUDA distribution -* `sudo sh install-rocm.sh` - installs a compatible ROCm stack -* `sudo sh install-hipsycl.sh` - installs hipSYCL. - -Unless you have a massive machine, you can expect this to run for half an eternity, so patience is a prerequisite for this installation approach. The easier way is to use our provided binary packages. -The installation prefix can be changed using the environment variable `INSTALL_PREFIX` (default is `/opt/hipSYCL`). Note that the `install-hipsycl.sh` script builds hipSYCL with support for both CUDA and ROCm backends by default, which means you need to have both installed. If you wish to disable support for CUDA/ROCm, set the `HIPSYCL_WITH_CUDA` or `HIPSYCL_WITH_ROCM` environment variables to `OFF`. - -If you change the `INSTALL_PREFIX` to a directory that is writable by your user, `sudo` is not required. - -## Building a singularity container -We also provide singularity definition files in order to create singularity container images. Building an image consists of building a writable base image and afterwards installing the dependencies and hipsycl into the container - -``` -singularity build --fakeroot --sandbox base-ubuntu-18.04.sif base-definitions/base-ubuntu-18.04.def -``` -for Ubuntu 18.04. Once this image is built, you can start adding the dependencies -``` -singularity exec hipsycl-ubuntu-18.04.def install-llvm.sh -singularity exec hipsycl-ubuntu-18.04.def install-rocm.sh -singularity exec hipsycl-ubuntu-18.04.def install-cuda.sh -``` -Note that there are two type of installation scripts available at the moment the regular ones located in the `install/scripts/` directory, and scripts that use spack to install the dependencies located in `install/scripts/spack-install/`. 
The spack install scripts are well tested, therefore we recommend using those for the installation. The regular install scripts might need some changes to work flawlessly. - -## Pre-built singularity containers - -We provide pre-built singularity images for all supported distributions. The containers are available here: http://repo.urz.uni-heidelberg.de/sycl/singularity/ - -The images are validated by building the hipSYCL unit tests for all supported backends, and running them for OpenMP and CUDA. - -Please note that due to legal reasons, the images do not contain the CUDA installation. Please use the `install/scripts/install-cuda.sh` script to install it afterwards. Note that this is only possible in case the container is writable; therefore we recommend installing CUDA by executing the following commands: - -``` -singularity shell build --sandbox --fakeroot .sif -singularity exec --writable --fakeroot bash install/scripts/install-cuda.sh -``` - -## Creating packages -In order to create binary packages for your distribution, you will first have to create container images as described above. Then run (e.g., for Ubuntu): -``` -cd packaging; singularity exec hipsycl-image.sif sh make-ubuntu-pkg.sh -``` -This script will generate three packages: -* `hipSYCL-base` contains the LLVM stack and clang compiler for hipSYCL. This package must always be installed -* `hipSYCL-rocm` contains a ROCm stack. This must be installed if you wish to target ROCm -* `hipSYCL` contains the actual hipSYCL libraries, headers and tools - -Creating CUDA packages is also possible, but this functionality is separate since we do not distribute CUDA binary packages for legal reasons. In order to create a CUDA package, just run the `make-ubuntu-cuda.sh` (for Ubuntu, analogously for other distributions) script. This script can be run on its own and does not require the building the entire stack including container image. 
-Note: If you only intend to install hipSYCL's CUDA stack on a single machine for home use, it may be easier and faster to just install it directly using the install script: Run -``` -sudo sh install-cuda.sh -``` -which will install it directly to `/opt/hipSYCL/cuda` where hipSYCL expects it. diff --git a/install/scripts/add-hipsycl-repo/archlinux-rolling.sh b/install/scripts/add-hipsycl-repo/archlinux-rolling.sh deleted file mode 100644 index 580b574a6..000000000 --- a/install/scripts/add-hipsycl-repo/archlinux-rolling.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -o xtrace - -echo '[hipsycl]' >> /etc/pacman.conf -echo "Server = http://repo.urz.uni-heidelberg.de/sycl${1}/archlinux/x86_64" >> /etc/pacman.conf - -pacman-key --init -wget -q -O - http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc | pacman-key --add - -pacman-key --lsign-key E967BA09716F870320089583E68CC4B9B2B75080 -pacman -Sy - - diff --git a/install/scripts/add-hipsycl-repo/centos-7.sh b/install/scripts/add-hipsycl-repo/centos-7.sh deleted file mode 100644 index 431818c4d..000000000 --- a/install/scripts/add-hipsycl-repo/centos-7.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -yum update -y -yum install epel-release -y -yum install -y rpm-build sed wget curl patch -yum install centos-release-scl -y -yum-config-manager --add-repo http://repo.urz.uni-heidelberg.de/sycl$1/rpm/centos7/hipsycl.repo diff --git a/install/scripts/add-hipsycl-repo/ubuntu-18.04.sh b/install/scripts/add-hipsycl-repo/ubuntu-18.04.sh deleted file mode 100644 index 2320fdcbb..000000000 --- a/install/scripts/add-hipsycl-repo/ubuntu-18.04.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -add-apt-repository -y ppa:ubuntu-toolchain-r/test -echo "deb http://repo.urz.uni-heidelberg.de/sycl$1/deb/ ./bionic main" > /etc/apt/sources.list.d/hipsycl.list -wget -q -O - http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc | apt-key add - -apt update - diff --git a/install/scripts/add-hipsycl-repo/ubuntu-20.04.sh 
b/install/scripts/add-hipsycl-repo/ubuntu-20.04.sh deleted file mode 100644 index ce2c66e8c..000000000 --- a/install/scripts/add-hipsycl-repo/ubuntu-20.04.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -export DEBIAN_FRONTEND=noninteractive -echo "deb http://repo.urz.uni-heidelberg.de/sycl$1/deb/ ./focal main" > /etc/apt/sources.list.d/hipsycl.list -wget -q -O - http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc | apt-key add - -apt update diff --git a/install/scripts/base-definitions/archlinux-rolling.def b/install/scripts/base-definitions/archlinux-rolling.def deleted file mode 100644 index 5ca08ad3a..000000000 --- a/install/scripts/base-definitions/archlinux-rolling.def +++ /dev/null @@ -1,9 +0,0 @@ -BootStrap: docker -From: archlinux:base - -%setup - -%post -[ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ] && bash /install-cuda.sh || echo "Not building CUDA" -pacman -Syu --noconfirm -pacman -S --noconfirm unzip sed wget git python3 parallel tar perl base-devel cmake curl diff --git a/install/scripts/base-definitions/centos-7.def b/install/scripts/base-definitions/centos-7.def deleted file mode 100644 index 565038c78..000000000 --- a/install/scripts/base-definitions/centos-7.def +++ /dev/null @@ -1,22 +0,0 @@ -BootStrap: docker -From: centos:centos7 - -%environment -HIPSYCL_BASE_CC=gcc -HIPSYCL_BASE_CXX=g++ -. 
/opt/rh/devtoolset-9/enable -%setup - -%post -yum update -y -yum install epel-release -y -yum install -y rpm-build sed unzip python34 python3 git parallel wget perl perl-Data-Dumper cmake3 curl patch -yum install centos-release-scl -y -yum install devtoolset-9 -y -yum install lbzip2 -y -#We neeed proper cmake -yum remove cmake -ln -s /usr/bin/cmake3 /usr/bin/cmake -#bash /install-cuda.sh -#bash /install-base-spack.sh - diff --git a/install/scripts/base-definitions/ubuntu-18.04.def b/install/scripts/base-definitions/ubuntu-18.04.def deleted file mode 100644 index aa397b501..000000000 --- a/install/scripts/base-definitions/ubuntu-18.04.def +++ /dev/null @@ -1,16 +0,0 @@ -BootStrap: docker -From: ubuntu:18.04 - -%setup - -%post -apt update -apt install -y sed unzip wget gcc g++ git python3 parallel perl perl-modules cmake curl -apt install -y software-properties-common -apt install -y software-properties-common -add-apt-repository -y ppa:ubuntu-toolchain-r/test -apt -y install g++-9 - - -#bash /install-cuda.sh -#bash /install-base-spack.sh diff --git a/install/scripts/hipsycl-archlinux-rolling.def b/install/scripts/hipsycl-archlinux-rolling.def deleted file mode 100644 index d8f984163..000000000 --- a/install/scripts/hipsycl-archlinux-rolling.def +++ /dev/null @@ -1,8 +0,0 @@ -BootStrap: localimage -From: base-archlinux-rolling.sif - -%setup -cp ./install-hipsycl.sh ${SINGULARITY_ROOTFS}/install-hipsycl.sh - -%post -sh /install-hipsycl.sh diff --git a/install/scripts/hipsycl-centos-7.def b/install/scripts/hipsycl-centos-7.def deleted file mode 100644 index 48f81cb40..000000000 --- a/install/scripts/hipsycl-centos-7.def +++ /dev/null @@ -1,10 +0,0 @@ -BootStrap: localimage -From: base-centos-7.sif - -%environment -source /opt/rh/devtoolset-7/enable -%setup -cp ./install-hipsycl.sh ${SINGULARITY_ROOTFS}/install-hipsycl.sh - -%post -sh /install-hipsycl.sh diff --git a/install/scripts/hipsycl-minimal-install.sh b/install/scripts/hipsycl-minimal-install.sh deleted file 
mode 100644 index 5c3cbd1ac..000000000 --- a/install/scripts/hipsycl-minimal-install.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -e - -echo "This will install hipSYCL into the current directory in a VERY minimal configuration:" -echo "The installation will only support CPU and no LLVM compiler acceleration of SYCL kernels." -echo "For production use and performance, this may not be ideal, but if you just quickly want to have a SYCL implementation, it might be perfect :-)" -echo "" -echo "The only dependencies required are:" -echo " * Your default system compiler must support C++17 and OpenMP" -echo " * You need to have installed the boost.context and boost.fiber libraries, including development files (e.g. on Ubuntu, the libboost-all-dev package)." -echo " * python 3" -echo " * cmake" -echo "" -echo "Make sure these dependencies are satisfied and press enter to continue". -read ARG - - -rm -rf ./hipsycl-build -mkdir -p ./hipsycl-build -git clone https://github.com/illuhad/hipSYCL ./hipsycl-build -mkdir -p ./hipsycl-build/build -cd ./hipsycl-build/build -cmake -DCMAKE_INSTALL_PREFIX=`pwd`/../.. -DWITH_CUDA_BACKEND=OFF -DWITH_ROCM_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF -DWITH_OPENCL_BACKEND=OFF -DACPP_COMPILER_FEATURE_PROFILE=none .. 
-make install diff --git a/install/scripts/hipsycl-ubuntu-18.04.def b/install/scripts/hipsycl-ubuntu-18.04.def deleted file mode 100644 index 735962192..000000000 --- a/install/scripts/hipsycl-ubuntu-18.04.def +++ /dev/null @@ -1,8 +0,0 @@ -BootStrap: localimage -From: base-ubuntu-18.04.sif - -%setup -cp ./install-hipsycl.sh ${SINGULARITY_ROOTFS}/install-hipsycl.sh - -%post -sh /install-hipsycl.sh diff --git a/install/scripts/install-base-spack.sh b/install/scripts/install-base-spack.sh deleted file mode 100644 index b88314a5c..000000000 --- a/install/scripts/install-base-spack.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -e - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -git clone https://github.com/spack/spack.git -export SPACK_ROOT=/spack -export PATH=$SPACK_ROOT/bin:$PATH - -sed -i 's|root: .*$|root: /opt/hipSYCL/llvm/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 64|' spack/etc/spack/defaults/config.yaml -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -spack install llvm@$llvm_version libcxx=False - -sed -i 's|root: .*$|root: /opt/hipSYCL/boost/|' spack/etc/spack/defaults/config.yaml -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -spack install boost%clang@$llvm_version - -sed -i 's|root: .*$|root: /opt/hipSYCL/rocm/|' spack/etc/spack/defaults/config.yaml -find . | grep -E "(__pycache__|\.pyc|\.pyo$)" | xargs rm -rf -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -spack install hip%clang@$llvm_version - - - diff --git a/install/scripts/install-cuda.sh b/install/scripts/install-cuda.sh deleted file mode 100644 index b742fc5ad..000000000 --- a/install/scripts/install-cuda.sh +++ /dev/null @@ -1,10 +0,0 @@ -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} - -CUDA_INSTALLER_FILENAME=cuda_10.0.130_410.48_linux - -set -e -cd /tmp -if [ ! 
-f $CUDA_INSTALLER_FILENAME ]; then - wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/$CUDA_INSTALLER_FILENAME -fi -sh $CUDA_INSTALLER_FILENAME --override --silent --toolkit --toolkitpath $HIPSYCL_INSTALL_PREFIX/cuda diff --git a/install/scripts/install-hipsycl.sh b/install/scripts/install-hipsycl.sh deleted file mode 100644 index 2ca3d9e3e..000000000 --- a/install/scripts/install-hipsycl.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} - -set -e -HIPSYCL_BUILD_DIR=${HIPSYCL_BUILD_DIR:-/tmp/hipSYCL-installer} -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-develop} -HIPSYCL_WITH_CUDA=${HIPSYCL_WITH_CUDA:-ON} -HIPSYCL_WITH_ROCM=${HIPSYCL_WITH_ROCM:-ON} -HIPSYCL_LLVM_DIR=${HIPSYCL_LLVM_DIR:-/opt/hipSYCL/llvm/lib/} - -if [ -d "$HIPSYCL_BUILD_DIR" ]; then - read -p "The build directory already exists, do you want to use $HIPSYCL_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then - echo "Please specify a different directory, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - else - echo "Using the exisiting directory" - fi -else -echo "Cloning hipSYCL" -git clone --recurse-submodules -b $HIPSYCL_REPO_BRANCH https://github.com/$HIPSYCL_REPO_USER/hipSYCL $HIPSYCL_BUILD_DIR - -fi - -mkdir -p $HIPSYCL_BUILD_DIR/build -cd $HIPSYCL_BUILD_DIR/build - -cmake \ --DWITH_CPU_BACKEND=ON \ --DWITH_CUDA_BACKEND=$HIPSYCL_WITH_CUDA \ --DWITH_ROCM_BACKEND=$HIPSYCL_WITH_ROCM \ --DLLVM_DIR=$HIPSYCL_LLVM_DIR \ --DROCM_PATH=$HIPSYCL_INSTALL_PREFIX/rocm \ --DCMAKE_INSTALL_PREFIX=$HIPSYCL_INSTALL_PREFIX \ -.. 
- -make -j `nproc` install diff --git a/install/scripts/install-llvm.sh b/install/scripts/install-llvm.sh deleted file mode 100644 index babb28a52..000000000 --- a/install/scripts/install-llvm.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-1} -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-llvmorg-${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.${HIPSYCL_PKG_LLVM_VERSION_MINOR}.${HIPSYCL_PKG_LLVM_VERSION_PATCH}} - -HIPSYCL_PKG_LLVM_VERSION=${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.${HIPSYCL_PKG_LLVM_VERSION_MINOR}.${HIPSYCL_PKG_LLVM_VERSION_PATCH} - -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-release/9.x} -HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} -HIPSYCL_LLVM_BUILD_DIR=${HIPSYCL_LLVM_BUILD_DIR:-$HOME/git/llvm-vanilla} - - -set -e -if [ -d "$HIPSYCL_LLVM_BUILD_DIR" ]; then - read -p "The build directory already exists, do you want to use $HIPSYCL_LLVM_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - echo "Using the exisiting directory" - else - echo "Please specify a different directory, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - fi -else - -echo "Cloning LLVM $HIPSYCL_PKG_LLVM_REPO_BRANCH" -git clone -b $HIPSYCL_PKG_LLVM_REPO_BRANCH https://github.com/llvm/llvm-project $HIPSYCL_LLVM_BUILD_DIR -fi - -case $HIPSYCL_PKG_LLVM_VERSION in - 9.0.1) - echo "Applying patch on $HIPSYCL_PKG_LLVM_VERSION" - sed -i 's/CHECK_SIZE_AND_OFFSET(ipc_perm, mode);//g' $HIPSYCL_LLVM_BUILD_DIR/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cc - ;; -esac - - -CC=${HIPSYCL_BASE_CC:-clang} -CXX=${HIPSYCL_BASE_CXX:-clang++} -BUILD_TYPE=Release -HIPSYCL_LLVM_INSTALL_PREFIX=$INSTALL_PREFIX/llvm -TARGETS_TO_BUILD="AMDGPU;NVPTX;X86" -NUMTHREADS=`nproc` - 
-CMAKE_OPTIONS="-DLLVM_ENABLE_PROJECTS=clang;compiler-rt;lld;openmp \ - -DOPENMP_ENABLE_LIBOMPTARGET=OFF \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$HIPSYCL_LLVM_INSTALL_PREFIX \ - -DLLVM_ENABLE_ASSERTIONS=OFF \ - -DLLVM_TARGETS_TO_BUILD=$TARGETS_TO_BUILD \ - -DCLANG_ANALYZER_ENABLE_Z3_SOLVER=0 \ - -DLLVM_INCLUDE_BENCHMARKS=0 \ - -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -DCMAKE_INSTALL_RPATH=$HIPSYCL_LLVM_INSTALL_PREFIX/lib \ - -DLLVM_ENABLE_OCAMLDOC=OFF \ - -DLLVM_ENABLE_BINDINGS=OFF \ - -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=OFF \ - -DLLVM_ENABLE_DUMP=OFF" - -mkdir -p $HIPSYCL_LLVM_BUILD_DIR/build -cd $HIPSYCL_LLVM_BUILD_DIR/build -cmake $CMAKE_OPTIONS $HIPSYCL_LLVM_BUILD_DIR/llvm -make -j $NUMTHREADS -make install -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/llvm-lit $HIPSYCL_LLVM_INSTALL_PREFIX/bin/llvm-lit -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/FileCheck $HIPSYCL_LLVM_INSTALL_PREFIX/bin/FileCheck -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/count $HIPSYCL_LLVM_INSTALL_PREFIX/bin/count -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/not $HIPSYCL_LLVM_INSTALL_PREFIX/bin/not -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/yaml-bench $HIPSYCL_LLVM_INSTALL_PREFIX/yaml-bench diff --git a/install/scripts/install-rocm.sh b/install/scripts/install-rocm.sh deleted file mode 100644 index 67c882a36..000000000 --- a/install/scripts/install-rocm.sh +++ /dev/null @@ -1,55 +0,0 @@ -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} - -HIPSYCL_PKG_AOMP_RELEASE=${HIPSYCL_PKG_AOMP_VERSION:-0.7-7} -HIPSYCL_PKG_AOMP_TAG=${HIPSYCL_PKG_AOMP_TAG:-rel_${HIPSYCL_PKG_AOMP_RELEASE}} - -set -e -HIPSYCL_ROCM_BUILD_DIR=${HIPSYCL_ROCM_BUILD_DIR:-$HOME/git/aomp} - -export CC=${HIPSYCL_BASE_CC:-clang} -export CXX=${HIPSYCL_BASE_CXX:-clang++} -export SUDO=${SUDO:-"disable"} -export AOMP=$HIPSYCL_INSTALL_PREFIX/rocm -export BUILD_TYPE=Release -#export NVPTXGPUS=60,61,62,70 -#export AOMP_BUILD_HIPSYCL_ESSENTIAL=1 -export 
AOMP_BUILD_HIP=1 -export CUDA=${CUDA:-$HIPSYCL_INSTALL_PREFIX/cuda} -#export AOMP_BUILD_CUDA=1 - -if [ -d "$HIPSYCL_ROCM_BUILD_DIR" ]; then - read -p "The build directory already exists, do you want to use $HIPSYCL_ROCM_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - echo "Using the exisiting directory" - else - echo "Please specify a different directory, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - fi -else -echo "Cloning aomp" -git clone -b $HIPSYCL_PKG_AOMP_TAG https://github.com/ROCm-Developer-Tools/aomp $HIPSYCL_ROCM_BUILD_DIR/aomp -cd $HIPSYCL_ROCM_BUILD_DIR/aomp/bin -./clone_aomp.sh -fi - - -cd $HIPSYCL_ROCM_BUILD_DIR/aomp/bin -case $HIPSYCL_PKG_AOMP_RELEASE in - 0.7-7) - sed -i 's/openmp pgmath flang flang_runtime//g' $HIPSYCL_ROCM_BUILD_DIR/aomp/bin/build_aomp.sh - sed -i 's/exit 1//g' $HIPSYCL_ROCM_BUILD_DIR/aomp/bin/build_hcc.sh - # This aomp patch to support HIP in conjunction with OpenMP breaks HIP clang printf, - # so we remove it - sed -i 's/patch -p1 < $thisdir\/hip.patch//g' $HIPSYCL_ROCM_BUILD_DIR/aomp/bin/build_hip.sh - - # Remove problematic -Werror compilation arguments - sed -i 's/ -Werror//g' $HIPSYCL_ROCM_BUILD_DIR/aomp-extras/hostcall/lib/CMakeLists.txt - sed -i 's/ -Werror//g' $HIPSYCL_ROCM_BUILD_DIR/rocr-runtime/src/CMakeLists.txt - - # Remove for compatibility with glibc 2.31 - sed -i 's/CHECK_SIZE_AND_OFFSET(ipc_perm, mode);//g' $HIPSYCL_ROCM_BUILD_DIR/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cc - sed -i 's/CHECK_SIZE_AND_OFFSET(ipc_perm, mode);//g' $HIPSYCL_ROCM_BUILD_DIR/hcc/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp - ;; -esac -./build_aomp.sh diff --git a/install/scripts/packaging/common/init.sh b/install/scripts/packaging/common/init.sh deleted file mode 100644 index 6670c89ae..000000000 --- a/install/scripts/packaging/common/init.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the 
built singularity container - -# define variables - version and build paths -HIPSYCL_VERSION=24.02.0 -HIPSYCL_BUILD=`date +%Y%m%d` -HIPSYCL_VERSION_STRING=${HIPSYCL_VERSION}-${HIPSYCL_BUILD} -HIPSYCL_GPG_KEY=${HIPSYCL_GPG_KEY:-B2B75080} - -#BUILD_DIR=`mktemp -d` -BUILD_DIR=${HIPSYCL_PACKAGING_DIR:-/tmp/hipsycl-packages} - -#Base packages -CUDA_PKG=hipSYCL-base-cuda-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -ROCM_PKG=hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -COMMON_PKG=hipSYCL-base-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} - -#hipSYCL packages -HIPSYCL_CORE_PKG=hipSYCL-core-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_CUDA_PKG=hipSYCL-cuda-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_ROCM_PKG=hipSYCL-rocm-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_OMP_PKG=hipSYCL-omp-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} - -#Meta packages -HIPSYCL_META_PKG=hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_FULL_PKG=hipSYCL-full-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} - - -echo "Building packages in directory ${BUILD_DIR}..." 
- -export CUDA_DIR=${BUILD_DIR}/${CUDA_PKG} -export ROCM_DIR=${BUILD_DIR}/${ROCM_PKG} -export COMMON_DIR=${BUILD_DIR}/${COMMON_PKG} - -export HIPSYCL_CORE_DIR=${BUILD_DIR}/${HIPSYCL_CORE_PKG} -export HIPSYCL_CUDA_DIR=${BUILD_DIR}/${HIPSYCL_CUDA_PKG} -export HIPSYCL_ROCM_DIR=${BUILD_DIR}/${HIPSYCL_ROCM_PKG} -export HIPSYCL_OMP_DIR=${BUILD_DIR}/${HIPSYCL_OMP_PKG} - -export HIPSYCL_META_DIR=${BUILD_DIR}/${HIPSYCL_META_PKG} -export HIPSYCL_FULL_DIR=${BUILD_DIR}/${HIPSYCL_FULL_PKG} - -# Make sure there are no residual files -# from previous builds -rm -rf ${CUDA_DIR}/opt || true -rm -rf ${ROCM_DIR}/opt || true -rm -rf ${COMMON_DIR}/opt || true - -rm -rf ${HIPSYCL_CORE_DIR} || true -rm -rf ${HIPSYCL_CUDA_DIR} || true -rm -rf ${HIPSYCL_ROCM_DIR} || true -rm -rf ${HIPSYCL_OMP_DIR} || true - -# create build directories -mkdir -p ${CUDA_DIR}/opt/hipSYCL/cuda -mkdir -p ${ROCM_DIR}/opt/hipSYCL/rocm -mkdir -p ${COMMON_DIR}/opt/hipSYCL - -mkdir -p ${HIPSYCL_CORE_DIR}/opt/hipSYCL -mkdir -p ${HIPSYCL_CUDA_DIR}/opt/hipSYCL/lib/hipSYCL -mkdir -p ${HIPSYCL_ROCM_DIR}/opt/hipSYCL/lib/hipSYCL -mkdir -p ${HIPSYCL_OMP_DIR}/opt/hipSYCL/lib/hipSYCL - - -# sort installed binaries into build paths -cp -R /opt/hipSYCL/rocm/* ${ROCM_DIR}/opt/hipSYCL/rocm || true -cp -R /opt/hipSYCL/llvm ${COMMON_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/boost ${COMMON_DIR}/opt/hipSYCL || true - -cp -R /opt/hipSYCL/bin ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/etc ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/include ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/lib ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -rm -rf ${HIPSYCL_CORE_DIR}/opt/hipSYCL/lib/hipSYCL/* || true - -cp /opt/hipSYCL/lib/hipSYCL/librt-backend-cuda.so ${HIPSYCL_CUDA_DIR}/opt/hipSYCL/lib/hipSYCL || true -cp /opt/hipSYCL/lib/hipSYCL/librt-backend-hip.so ${HIPSYCL_ROCM_DIR}/opt/hipSYCL/lib/hipSYCL || true -cp /opt/hipSYCL/lib/hipSYCL/librt-backend-omp.so ${HIPSYCL_OMP_DIR}/opt/hipSYCL/lib/hipSYCL || 
true - diff --git a/install/scripts/packaging/make-archlinux-cuda-pkg.sh b/install/scripts/packaging/make-archlinux-cuda-pkg.sh deleted file mode 100644 index d61e6f233..000000000 --- a/install/scripts/packaging/make-archlinux-cuda-pkg.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -e - -. ./common/init.sh - -mkdir -p ${CUDA_DIR}/pkg -cp ../install-cuda.sh ${CUDA_DIR}/pkg/ - - -cat << EOF > ${CUDA_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-cuda -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="CUDA stack for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('NVIDIA CUDA EULA') -depends=('hipSYCL') -source=('install-cuda.sh') -md5sums=() -validpgpkeys=() - -package(){ - INSTALL_PREFIX=\$pkgdir/opt/hipSYCL sh ./install-cuda.sh -} - -EOF - -cd ${CUDA_DIR}/pkg && makepkg -d -c --skipinteg - - diff --git a/install/scripts/packaging/make-archlinux-pkg.sh b/install/scripts/packaging/make-archlinux-pkg.sh deleted file mode 100644 index 178a46a81..000000000 --- a/install/scripts/packaging/make-archlinux-pkg.sh +++ /dev/null @@ -1,239 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. 
./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - -echo $HIPSYCL_GPG_KEY -if [ -n "$HIPSYCL_GPG_KEY" ]; then - SIGN=" --sign --key $HIPSYCL_GPG_KEY" -fi - -tar -cvf ${BUILD_DIR}/cuda-pkg.tar.gz -C ${CUDA_DIR} opt/ -tar -cvf ${BUILD_DIR}/rocm-pkg.tar.gz -C ${ROCM_DIR} opt/ -tar -cvf ${BUILD_DIR}/common-pkg.tar.gz -C ${COMMON_DIR} opt/ - -tar -cvf ${BUILD_DIR}/hipsycl-core-pkg.tar.gz -C ${HIPSYCL_CORE_DIR} opt/ -tar -cvf ${BUILD_DIR}/hipsycl-cuda-pkg.tar.gz -C ${HIPSYCL_CUDA_DIR} opt/ -tar -cvf ${BUILD_DIR}/hipsycl-rocm-pkg.tar.gz -C ${HIPSYCL_ROCM_DIR} opt/ -tar -cvf ${BUILD_DIR}/hipsycl-omp-pkg.tar.gz -C ${HIPSYCL_OMP_DIR} opt/ - -mkdir -p ${CUDA_DIR}/pkg -mkdir -p ${ROCM_DIR}/pkg -mkdir -p ${COMMON_DIR}/pkg - -mkdir -p ${HIPSYCL_CORE_DIR}/pkg -mkdir -p ${HIPSYCL_CUDA_DIR}/pkg -mkdir -p ${HIPSYCL_ROCM_DIR}/pkg -mkdir -p ${HIPSYCL_OMP_DIR}/pkg - -mkdir -p ${HIPSYCL_FULL_DIR}/pkg -mkdir -p ${HIPSYCL_META_DIR}/pkg - -mv ${BUILD_DIR}/cuda-pkg.tar.gz ${CUDA_DIR}/pkg/ -mv ${BUILD_DIR}/rocm-pkg.tar.gz ${ROCM_DIR}/pkg/ -mv ${BUILD_DIR}/common-pkg.tar.gz ${COMMON_DIR}/pkg/ - -mv ${BUILD_DIR}/hipsycl-core-pkg.tar.gz ${HIPSYCL_CORE_DIR}/pkg -mv ${BUILD_DIR}/hipsycl-cuda-pkg.tar.gz ${HIPSYCL_CUDA_DIR}/pkg -mv ${BUILD_DIR}/hipsycl-rocm-pkg.tar.gz ${HIPSYCL_ROCM_DIR}/pkg -mv ${BUILD_DIR}/hipsycl-omp-pkg.tar.gz ${HIPSYCL_OMP_DIR}/pkg - - - -cat << EOF > ${HIPSYCL_CORE_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-core-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=('hipSYCL-omp-${HIPSYCL_PKG_TYPE}' 'python' ) -provides=('hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) 
-source=('hipsycl-core-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_CUDA_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="cuda backend for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=( 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=('hipSYCL-cuda-${HIPSYCL_PKG_TYPE}' ) -source=('hipsycl-cuda-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_ROCM_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-rocm-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="rocm backend for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=('hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}' 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=('hipSYCL-rocm-${HIPSYCL_PKG_TYPE}' ) -source=('hipsycl-rocm-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_OMP_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-omp-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="omp backend for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=('hipSYCL-base-${HIPSYCL_PKG_TYPE}' 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=('hipSYCL-omp-${HIPSYCL_PKG_TYPE}' ) -source=('hipsycl-omp-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${COMMON_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-base-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="LLVM compiler stack for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=('numactl') -source=('common-pkg.tar.gz') -md5sums=() -validpgpkeys=() - - -package() { - cp 
-R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${ROCM_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="ROCm compiler stack and libraries for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=( 'pciutils' 'libelf' 'perl' 'pkg-config') -provides=('hipSYCL-${HIPSYCL_PKG_TYPE}' 'SYCL-${HIPSYCL_PKG_TYPE}') -source=('rocm-pkg.tar.gz') -md5sums=() -validpgpkeys=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_FULL_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-full-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=( 'hipSYCL-${HIPSYCL_PKG_TYPE}' ) -provides=( 'hipSYCL-full-${HIPSYCL_PKG_TYPE}' ) - -EOF - -cat << EOF > ${HIPSYCL_META_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=( 'hipSYCL-cuda-${HIPSYCL_PKG_TYPE}' 'hipSYCL-rocm-${HIPSYCL_PKG_TYPE}' 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=( 'hipSYCL-${HIPSYCL_PKG_TYPE}' ) - -EOF - -cat << EOF > ${CUDA_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="CUDA stack for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('NVIDIA CUDA EULA') -depends=() -provides=('cuda') -source=('cuda-pkg.tar.gz') - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then -cd ${HIPSYCL_CORE_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -cd 
${HIPSYCL_CUDA_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -cd ${HIPSYCL_ROCM_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -cd ${HIPSYCL_OMP_DIR}/pkg && makepkg -d -c --skipinteg $SIGN - -cd ${HIPSYCL_META_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -cd ${HIPSYCL_FULL_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -cd ${COMMON_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -fi - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -cd ${ROCM_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -cd ${CUDA_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -echo $HIPSYCL_PKG_BUILD_CUDA -fi diff --git a/install/scripts/packaging/make-centos-7-pkg.sh b/install/scripts/packaging/make-centos-7-pkg.sh deleted file mode 100644 index 0289d359b..000000000 --- a/install/scripts/packaging/make-centos-7-pkg.sh +++ /dev/null @@ -1,271 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. 
./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - -RPM_ROOT=${BUILD_DIR}/rpm -mkdir -p ${RPM_ROOT}/{SOURCES,BUILD,RPMS,SPECS,SRPMS,tmp} - - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-core-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-omp-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CORE_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/bin -/opt/hipSYCL/lib -/opt/hipSYCL/include -/opt/hipSYCL/etc - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -Summary: cuda backend for hipSYCL -Name: hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -Summary: rocm backend for hipSYCL -Name: hipSYCL-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} 
-Requires: python3, hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -Summary: omp backend for hipSYCL -Name: hipSYCL-omp-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-base-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_OMP_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/lib/hipSYCL -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} - -%files - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-full-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} - -%files - -EOF - -cat << 
EOF > ${RPM_ROOT}/SPECS/hipSYCL-base.spec -Summary: base LLVM compiler stack for hipSYCL -Name: hipSYCL-base-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-base-${HIPSYCL_VERSION_STRING} -Requires: devtoolset-9, binutils, lbzip2 -AutoReq: no - -%description -%{summary} - -%install -cp -R ${COMMON_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/llvm -/opt/hipSYCL/boost - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-base-rocm.spec -Summary: ROCm stack for hipSYCL -Name: hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-rocm-${HIPSYCL_VERSION_STRING} -Requires: numactl-devel, numactl-libs, pciutils-devel, pciutils-libs, perl, elfutils-libelf-devel -AutoReq: no - -%description -%{summary} - -%install -cp -R ${ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/rocm - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda.spec -Summary: CUDA stack for hipSYCL -Name: hipSYCL-cuda -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: NVIDIA CUDA EULA -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-cuda-${HIPSYCL_VERSION_STRING} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/cuda - -EOF - - -cd ${RPM_ROOT}/SPECS - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then -rpmbuild -bb hipSYCL-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec - -rpmbuild -bb hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb 
hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -rpmbuild -bb hipSYCL-base.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -rpmbuild -bb hipSYCL-base-rocm.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -rpmbuild -D '%_python_bytecompile_errors_terminate_build 0' -bb hipSYCL-cuda.spec -fi diff --git a/install/scripts/packaging/make-centos-8-pkg.sh b/install/scripts/packaging/make-centos-8-pkg.sh deleted file mode 100644 index b4d706284..000000000 --- a/install/scripts/packaging/make-centos-8-pkg.sh +++ /dev/null @@ -1,282 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. ./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - -RPM_ROOT=${BUILD_DIR}/rpm -mkdir -p ${RPM_ROOT}/{SOURCES,BUILD,RPMS,SPECS,SRPMS,tmp} - -# We need to use %undefine __brp_mangle_shebangs -# since llvm contains ambigous python shebangs -# Probably fixing these here is not the best idea -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-core-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-omp-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CORE_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/bin -/opt/hipSYCL/lib -/opt/hipSYCL/include -/opt/hipSYCL/etc - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -Summary: cuda backend for hipSYCL 
-Name: hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -Summary: rocm backend for hipSYCL -Name: hipSYCL-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -Summary: omp backend for hipSYCL -Name: hipSYCL-omp-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-base-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_OMP_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/lib/hipSYCL -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs 
-Name: hipSYCL-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-full-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-base.spec -Summary: base LLVM compiler stack for hipSYCL -Name: hipSYCL-base-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-base-${HIPSYCL_VERSION_STRING} -Requires: binutils, lbzip2, gcc-toolset-9-toolchain -AutoReq: no - -%description -%{summary} - -%install -cp -R ${COMMON_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/llvm -/opt/hipSYCL/boost - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-base-rocm.spec -Summary: ROCm stack for hipSYCL -Name: hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-rocm-${HIPSYCL_VERSION_STRING} 
-Requires: numactl-devel, numactl-libs, pciutils-devel, pciutils-libs, perl, elfutils-libelf-devel -AutoReq: no - -%description -%{summary} - -%install -cp -R ${ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/rocm - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda.spec -Summary: CUDA stack for hipSYCL -Name: hipSYCL-cuda -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: NVIDIA CUDA EULA -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-cuda-${HIPSYCL_VERSION_STRING} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/cuda - -EOF - - -cd ${RPM_ROOT}/SPECS - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec - -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-base.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-base-rocm.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -D '%_python_bytecompile_errors_terminate_build 0' -bb hipSYCL-cuda.spec -fi diff --git a/install/scripts/packaging/make-centos-cuda-pkg.sh 
b/install/scripts/packaging/make-centos-cuda-pkg.sh deleted file mode 100644 index 52e345fa5..000000000 --- a/install/scripts/packaging/make-centos-cuda-pkg.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. ./common/init.sh - - -RPM_ROOT=${BUILD_DIR}/rpm -mkdir -p ${RPM_ROOT}/{SOURCES,BUILD,RPMS,SPECS,SRPMS,tmp} - -rm -rf ${CUDA_DIR}/* -INSTALL_PREFIX=${CUDA_DIR}/opt/hipSYCL sh ../install-cuda.sh -rm -rf ${CUDA_DIR}/opt/hipSYCL/cuda/samples - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda.spec -Summary: CUDA stack for hipSYCL -Name: hipSYCL-cuda -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: NVIDIA CUDA EULA -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-cuda-${HIPSYCL_VERSION_STRING} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${CUDA_DIR}/* %{buildroot} - - -%files -/opt/hipSYCL/cuda - -EOF - - -cd ${RPM_ROOT}/SPECS -rpmbuild -D '%_python_bytecompile_errors_terminate_build 0' -bb hipSYCL-cuda.spec - diff --git a/install/scripts/packaging/make-ubuntu-cuda-pkg.sh b/install/scripts/packaging/make-ubuntu-cuda-pkg.sh deleted file mode 100644 index c96a2d045..000000000 --- a/install/scripts/packaging/make-ubuntu-cuda-pkg.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -e - -. 
./common/init.sh - -mkdir -p ${CUDA_DIR}/DEBIAN - -cat << EOF > ${CUDA_DIR}/DEBIAN/control -Package: hipsycl-cuda -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Maintainer: Aksel Alpay -Depends: hipSYCL -Description: CUDA stack for hipSYCL - Provides CUDA toolkit for hipSYCL -EOF - -INSTALL_PREFIX=${CUDA_DIR}/opt/hipSYCL sh ../install-cuda.sh - -cd ${BUILD_DIR} -dpkg-deb --build ${CUDA_PKG} - diff --git a/install/scripts/packaging/make-ubuntu-pkg.sh b/install/scripts/packaging/make-ubuntu-pkg.sh deleted file mode 100644 index 833e02d9f..000000000 --- a/install/scripts/packaging/make-ubuntu-pkg.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. ./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - - -mkdir -p ${CUDA_DIR}/DEBIAN -mkdir -p ${ROCM_DIR}/DEBIAN -mkdir -p ${COMMON_DIR}/DEBIAN - -mkdir -p ${HIPSYCL_CORE_DIR}/DEBIAN -mkdir -p ${HIPSYCL_CUDA_DIR}/DEBIAN -mkdir -p ${HIPSYCL_ROCM_DIR}/DEBIAN -mkdir -p ${HIPSYCL_OMP_DIR}/DEBIAN - -mkdir -p ${HIPSYCL_META_DIR}/DEBIAN -mkdir -p ${HIPSYCL_FULL_DIR}/DEBIAN - -cat << EOF > ${HIPSYCL_CORE_DIR}/DEBIAN/control -Package: hipsycl-core-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-omp-${HIPSYCL_PKG_TYPE}, python3 (>= 3.0) -Maintainer: Aksel Alpay -Description: hipSYCL${HIPSYCL_VERSION_STRING} - Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -EOF - -cat << EOF > ${HIPSYCL_CUDA_DIR}/DEBIAN/control -Package: hipsycl-cuda-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-core-${HIPSYCL_PKG_TYPE} -Maintainer: Aksel Alpay -Description: 
hipSYCL${HIPSYCL_VERSION_STRING} - Cuda backend for hipSYCL -EOF - -cat << EOF > ${HIPSYCL_ROCM_DIR}/DEBIAN/control -Package: hipsycl-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-core-${HIPSYCL_PKG_TYPE}, hipsycl-base-rocm-${HIPSYCL_PKG_TYPE} , python3 (>= 3.0) -Maintainer: Aksel Alpay -Description: hipSYCL${HIPSYCL_VERSION_STRING} - Rocm backend for hipSYCL -EOF - -cat << EOF > ${HIPSYCL_OMP_DIR}/DEBIAN/control -Package: hipsycl-omp-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-core-${HIPSYCL_PKG_TYPE}, hipsycl-base-${HIPSYCL_PKG_TYPE} , python3 (>= 3.0) -Maintainer: Aksel Alpay -Description: hipSYCL${HIPSYCL_VERSION_STRING} - omp backend for hipSYCL -EOF - -cat << EOF > ${COMMON_DIR}/DEBIAN/control -Package: hipsycl-base-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: g++-9, libnuma1, build-essential -Maintainer: Aksel Alpay -Description: hipSYCL base compiler stack - Provides an LLVM compiler stack for hipSYCL -EOF - -cat << EOF > ${ROCM_DIR}/DEBIAN/control -Package: hipsycl-base-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: libpci-dev -Maintainer: Aksel Alpay -Description: ROCm compiler stack for hipSYCL-${HIPSYCL_PKG_TYPE} Provides ROCm libraries for hipSYCL -EOF - - -cat << EOF > ${HIPSYCL_FULL_DIR}/DEBIAN/control -Package: hipsycl-full-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-rocm-${HIPSYCL_PKG_TYPE}, hipsycl-cuda-${HIPSYCL_PKG_TYPE} -Maintainer: Aksel Alpay -Description: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs - -EOF - -cat << EOF > ${HIPSYCL_META_DIR}/DEBIAN/control -Package: hipsycl-${HIPSYCL_PKG_TYPE} -Version: 
${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-full-${HIPSYCL_PKG_TYPE} -Maintainer: Aksel Alpay -Description: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs - -EOF - -cat << EOF > ${CUDA_DIR}/DEBIAN/control -Package: hipsycl-base-cuda-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Maintainer: Aksel Alpay -Description: CUDA stack for hipSYCL - Provides CUDA toolkit for hipSYCL -EOF - -cd ${BUILD_DIR} - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -dpkg-deb --build ${ROCM_PKG} -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -dpkg-deb --build ${COMMON_PKG} -fi - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then - -dpkg-deb --build ${HIPSYCL_CORE_DIR} -dpkg-deb --build ${HIPSYCL_CUDA_DIR} -dpkg-deb --build ${HIPSYCL_ROCM_DIR} -dpkg-deb --build ${HIPSYCL_OMP_DIR} - -dpkg-deb --build ${HIPSYCL_META_PKG} -dpkg-deb --build ${HIPSYCL_FULL_PKG} -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -dpkg-deb --build ${CUDA_PKG} -fi - diff --git a/install/scripts/rebuild-images.sh b/install/scripts/rebuild-images.sh deleted file mode 100644 index 2cdfb9a17..000000000 --- a/install/scripts/rebuild-images.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace - -if [ "$#" -ne 2 ]; then - echo " - Usage: <[script to run] OR build> - distro: the distro to install the software for - script_to_run: Execute the install script located in HIPSYCL_PKG_SCRIPT_DIR, ( by defauld install/scripts) - build: Build the base image into the HIPSYCL_PKG_CONTAINER_DIR folder. 
the scripts that are necessary are copid to the image - during the build time the definition file is located at: HIPSYCL_PKG_SCRIPT_DIR/base-.def - - Important ENV variables: - - HIPSYCL_PKG_CONTAINER_DIR - " - exit -1 -fi -distro=$1 -candidate=$2 - -HIPSYCL_PKG_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -HIPSYCL_PKG_CONTAINER_DIR=${HIPSYCL_PKG_CONTAINER_DIR:-$HIPSYCL_PKG_SCRIPT_DIR/containers} -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-release/9.x} -echo $HIPSYCL_PKG_CONTAINER_DIR -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} - -SINGULARITYENV_HIPSYCL_PKG_BUILD_CUDA=$HIPSYCL_PKG_BUILD_CUDA -SINGULARITYENV_HIPSYCL_PKG_BUILD_ROCM=$HIPSYCL_PKG_BUILD_ROCM -SINGULARITYENV_HIPSYCL_PKG_BUILD_BASE=$HIPSYCL_PKG_BUILD_BASE -SINGULARITYENV_HIPSYCL_PKG_LLVM_REPO_BRANCH=$HIPSYCL_PKG_LLVM_REPO_BRANCH -SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MAJOR=$HIPSYCL_PKG_LLVM_VERSION_MAJOR -SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MINOR=$HIPSYCL_PKG_LLVM_VERSION_MINOR -SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_PATCH=$HIPSYCL_PKG_LLVM_VERSION_PATCH -SINGULARITYENV_HIPSYCL_PKG_AOMP_RELEASE=$HIPSYCL_PKG_AOMP_RELEASE -SINGULARITYENV_HIPSYCL_PKG_AOMP_TAG=$HIPSYCL_PKG_AOMP_TAG -SINGULARITYENV_HIPSYCL_BUILD_DIR_PREFIX="/tmp/hipsycl-build-for-" -SINGULARITYENV_HIPSYCL_BUILD_DIR=$SINGULARITYENV_HIPSYCL_BUILD_DIR_PREFIX$distro - -export SINGULARITYENV_HIPSYCL_PKG_BUILD_CUDA -export SINGULARITYENV_HIPSYCL_PKG_BUILD_ROCM -export SINGULARITYENV_HIPSYCL_PKG_BUILD_BASE -export SINGULARITYENV_HIPSYCL_PKG_LLVM_REPO_BRANCH -export SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MAJOR -export SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MINOR -export SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_PATCH -export SINGULARITYENV_HIPSYCL_PKG_AOMP_RELEASE -export SINGULARITYENV_HIPSYCL_PKG_AOMP_TAG -export SINGULARITYENV_HIPSYCL_BUILD_DIR - - -echo 
$HIPSYCL_PKG_CONTAINER_DIR -cd $HIPSYCL_PKG_SCRIPT_DIR -mkdir -p $HIPSYCL_PKG_CONTAINER_DIR -mkdir -p /tmp/hipsycl-pkg-builder - -if [ "$candidate" = "build" ]; then - echo "Building $distro image... with base pkgs" - singularity build --fakeroot --sandbox -F $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro base-definitions/$distro.def - echo "Building $distro hipSYCL base via spack" -elif [ "$candidate" = "cleanup" ]; then - rm -rf $SINGULARITYENV_HIPSYCL_BUILD_DIR -else - #echo $HIPSYCL_PKG_LLVM_VERSION_MAJOR - tmpdir=$HIPSYCL_PKG_CONTAINER_DIR/tmp-$distro - mkdir -p $tmpdir - singularity -d exec --fakeroot --writable --no-home -B $HIPSYCL_PKG_SCRIPT_DIR:/mnt \ - $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro \ - bash /mnt/$candidate.sh -fi diff --git a/install/scripts/spack-install/boost.sh b/install/scripts/spack-install/boost.sh deleted file mode 100644 index 5943aee11..000000000 --- a/install/scripts/spack-install/boost.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -if [ ! -d ./spack ]; then - git clone https://github.com/spack/spack.git #-b v0.16.1 -fi -export SPACK_ROOT=/root/spack -export PATH=$SPACK_ROOT/bin:$PATH -. $SPACK_ROOT/share/spack/setup-env.sh - -sed -i 's|root: .*$|root: /opt/hipSYCL/boost/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 16|' spack/etc/spack/defaults/config.yaml -. $SPACK_ROOT/share/spack/setup-env.sh -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -# Spack distributed build in this form causes Timeouts sometimes.... maybe use a upstream solution... yeah probably.... 
- -parallel --joblog /tmp/spack-install-boost.exit --lb -N0 spack install boost%clang@$llvm_version context=True fiber=True target=x86_64 cxxstd=11 ::: {1..16} || error=1 -if [ "$error" = "1" ]; then - spack install boost%clang@$llvm_version context=True fiber=True target=x86_64 cxxstd=11 -fi -spack gc -y - diff --git a/install/scripts/spack-install/cuda.sh b/install/scripts/spack-install/cuda.sh deleted file mode 120000 index cb5a0b63e..000000000 --- a/install/scripts/spack-install/cuda.sh +++ /dev/null @@ -1 +0,0 @@ -../install-cuda.sh \ No newline at end of file diff --git a/install/scripts/spack-install/hipsycl.sh b/install/scripts/spack-install/hipsycl.sh deleted file mode 100644 index 9872251f3..000000000 --- a/install/scripts/spack-install/hipsycl.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-release/${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.x} - -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL/} - -set -e -HIPSYCL_BUILD_DIR=${HIPSYCL_BUILD_DIR:-/tmp/hipsycl-installer-hipsyclbuildbot} -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-develop} -HIPSYCL_WITH_CUDA=${HIPSYCL_WITH_CUDA:-ON} -HIPSYCL_WITH_ROCM=${HIPSYCL_WITH_ROCM:-ON} - -LLVM_INCLUDE_PATH=$HIPSYCL_INSTALL_PREFIX/llvm/llvm/lib/clang/${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.\ -${HIPSYCL_PKG_LLVM_VERSION_MINOR}.\ -${HIPSYCL_PKG_LLVM_VERSION_PATCH}/include -if [ -d "$HIPSYCL_BUILD_DIR" ]; then - read -p "hipsycl_installer: The build directory already exists, do you want to use $HIPSYCL_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then - echo "hipsycl_installer: Please specify a different directory than $HIPSYCL_BUILD_DIR, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - else - echo "hipsycl_installer: Using the exisiting directory" - fi -else -echo "hipsycl_installer: Cloning hipSYCL" -git clone --recurse-submodules -b $HIPSYCL_REPO_BRANCH https://github.com/$HIPSYCL_REPO_USER/hipSYCL $HIPSYCL_BUILD_DIR - -fi - -mkdir -p $HIPSYCL_BUILD_DIR/build -cd $HIPSYCL_BUILD_DIR/build - -# We need the llvm module to be loaded in order to be able to find the openmp rt -export SPACK_ROOT=/root/spack -export PATH=$SPACK_ROOT/bin:$PATH - -source /etc/profile -. $SPACK_ROOT/share/spack/setup-env.sh - -sed -i 's|root: .*$|root: /opt/hipSYCL/llvm/|' $SPACK_ROOT/etc/spack/defaults/config.yaml -spack load --only package llvm -rocm_path=/opt/hipSYCL/rocm/ - -cmake \ --DCMAKE_C_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang \ --DCMAKE_CXX_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang++ \ --DWITH_CPU_BACKEND=ON \ --DWITH_CUDA_BACKEND=$HIPSYCL_WITH_CUDA \ --DWITH_ROCM_BACKEND=$HIPSYCL_WITH_ROCM \ --DLLVM_DIR=/opt/hipSYCL/llvm/llvm/ \ --DROCM_PATH=/opt/hipSYCL/rocm/hip/ \ --DBOOST_ROOT=/opt/hipSYCL/boost/boost/ \ --DCUDA_TOOLKIT_ROOT_DIR=/opt/hipSYCL/cuda \ --DCLANG_EXECUTABLE_PATH=/opt/hipSYCL/llvm/llvm/bin/clang++ \ --DCLANG_INCLUDE_PATH=$LLVM_INCLUDE_PATH \ --DCMAKE_INSTALL_PREFIX=$HIPSYCL_INSTALL_PREFIX \ --DCMAKE_PREFIX_PATH="$rocm_path/comgr/lib/cmake;$rocm_path/rocm-device-libs/lib/cmake;$rocm_path/hsa-rocr-dev/lib/cmake;$rocm_path/hsa-rocr-dev/;$rocm_path/hip/lib/cmake" \ -.. 
- -make -j 16 install -cp /mnt/spack-install/spack-syclcc.json /opt/hipSYCL/etc/hipSYCL/syclcc.json diff --git a/install/scripts/spack-install/llvm.sh b/install/scripts/spack-install/llvm.sh deleted file mode 100644 index ec1e705fa..000000000 --- a/install/scripts/spack-install/llvm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -if [ ! -d ./spack ]; then - git clone https://github.com/spack/spack.git #-b v0.16.1 - # git clone https://github.com/spack/spack.git spack_upstream - # sed -i 's|- $spack/var/spack/repos/builtin|- /root/spack_upstream/var/spack/repos/builtin|' spack/etc/spack/defaults/repos.yaml -fi -export SPACK_ROOT=/root/spack -export PATH=$SPACK_ROOT/bin:$PATH -. $SPACK_ROOT/share/spack/setup-env.sh -spack compiler find || echo "No new compilers"; spack compilers - -sed -i 's|root: .*$|root: /opt/hipSYCL/llvm/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 16|' spack/etc/spack/defaults/config.yaml -sed -i 's|projects.append("clang-tools-extra")|#projects.append("clang-tools-extra")|' spack/var/spack/repos/builtin/packages/llvm/package.py - -. 
$SPACK_ROOT/share/spack/setup-env.sh -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -parallel --lb -N0 spack install llvm@$llvm_version cuda=False libcxx=True polly=False lldb=False lld=True internal_unwind=False gold=False target=x86_64 build_type=MinSizeRel -flang ::: {1..16} || error=1 -if [ "$error" = "1" ]; then - spack install llvm@$llvm_version -flang cuda=False libcxx=True polly=False lldb=False lld=True internal_unwind=False gold=False target=x86_64 build_type=MinSizeRel -fi -#spack install llvm@$llvm_version cuda=False libcxx=False target=x86_64 -spack load llvm -spack compiler find /opt/hipSYCL/llvm/llvm/ -spack unload llvm - diff --git a/install/scripts/spack-install/rocm.sh b/install/scripts/spack-install/rocm.sh deleted file mode 100644 index 7623aa752..000000000 --- a/install/scripts/spack-install/rocm.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} -HIPSYCL_HIP_VERSION=${HIPSYCL_HIP_VERSION:-4.0.0} - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -if [ ! -d ./spack ]; then - git clone https://github.com/spack/spack.git #-b v0.16.1 - # git clone https://github.com/spack/spack.git spack_upstream - # echo "upstreams: - # spack-instance-1: - # install_tree: /root/spack_upstream/opt/spack" > /root/spack/etc/spack/defaults/upstreams.yaml -fi -export SPACK_ROOT=~/spack -export PATH=$SPACK_ROOT/bin:$PATH -. $SPACK_ROOT/share/spack/setup-env.sh - -sed -i 's|root: .*$|root: /opt/hipSYCL/rocm/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 16|' spack/etc/spack/defaults/config.yaml -. 
$SPACK_ROOT/share/spack/setup-env.sh -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ - -# Somteimes some parallel instances exit due to waiting too long for a lock -# In case that happens we run the sequential version to check if everything have been -# installed properly -parallel --lb -N0 spack install hip@$HIPSYCL_HIP_VERSION%clang@$llvm_version target=x86_64 ::: {1..16} || error="1" -if [ "$error" = "1" ]; then - spack install hip%clang@$llvm_version target=x86_64 -fi -spack gc -y - diff --git a/install/scripts/spack-install/spack-syclcc.json b/install/scripts/spack-install/spack-syclcc.json deleted file mode 100644 index 2216abd94..000000000 --- a/install/scripts/spack-install/spack-syclcc.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "default-clang" : "/opt/hipSYCL/llvm/llvm/bin/clang++", - "default-platform" : "cuda", - "default-cuda-path" : "/opt/hipSYCL/cuda", - "default-gpu-arch" : "", - "default-cpu-cxx" : "/opt/hipSYCL/llvm/llvm/bin/clang++", - "default-rocm-path" : "/opt/hipSYCL/rocm", - "default-use-bootstrap-mode" : "false", - "default-is-dryrun" : "false", - "default-clang-include-path" : "/opt/hipSYCL/llvm/llvm/lib/clang/11.0.0/include/..", - "default-sequential-link-line" : "-L/opt/hipSYCL/boost/boost/lib -lboost_context -lboost_fiber -lomp -Wl,-rpath=/opt/hipSYCL/boost/boost/lib", - "default-sequential-cxx-flags" : "-I/opt/hipSYCL/boost/boost/include", - "default-omp-link-line" : "-L/opt/hipSYCL/boost/boost/lib -lboost_context -lboost_fiber -Wl,-rpath=/opt/hipSYCL/boost/boost/lib -Wl,-rpath=/opt/hipSYCL/llvm/llvm/lib -fopenmp", - "default-omp-cxx-flags" : "-I/opt/hipSYCL/boost/boost/include -fopenmp", - "default-rocm-link-line" : "-Wl,-rpath=$HIPSYCL_ROCM_PATH/lib -Wl,-rpath=$HIPSYCL_ROCM_PATH/hip/lib -Wl,-rpath=/opt/hipSYCL/llvm/llvm/lib -L/opt/hipSYCL/rocm/lib -L/opt/hipSYCL/rocm/hip/lib -lamdhip64", - "default-rocm-cxx-flags" : "-isystem /opt/hipSYCL/llvm/llvm/lib/clang/11.0.0/include/.. 
-U__FLOAT128__ -U__SIZEOF_FLOAT128__ -I$HIPSYCL_ROCM_PATH/hsa-rocr-dev/include -I$HIPSYCL_ROCM_PATH/hip/include --rocm-path=$HIPSYCL_ROCM_PATH/rocm-device-libs", - "default-cuda-link-line" : "-Wl,-rpath=$HIPSYCL_CUDA_LIB_PATH -Wl,-rpath=/opt/hipSYCL/llvm/llvm/lib -L$HIPSYCL_CUDA_LIB_PATH -lcudart", - "default-cuda-cxx-flags" : "-U__FLOAT128__ -U__SIZEOF_FLOAT128__" -} From ceb6a9de69940630651d477e391f6832774820fb Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 7 Dec 2024 03:56:08 +0100 Subject: [PATCH 093/126] Remove outdated packaging infrastructure --- devops/repos/README.md | 49 ---------- devops/repos/common/init.sh | 82 ---------------- devops/repos/create_pkgs.sh | 50 ---------- devops/repos/create_repos.sh | 22 ----- devops/repos/create_singularity_containers.sh | 12 --- .../base-archlinux-rolling.def | 8 -- .../base-centos-7.def | 11 --- .../base-ubuntu-18.04.def | 13 --- .../archlinux-rolling.def | 9 -- .../definitions-test-containers/centos-7.def | 17 ---- .../ubuntu-18.04.def | 13 --- .../ubuntu-20.04.def | 12 --- devops/repos/publish_test_container.sh | 31 ------ devops/repos/record_env_vars.sh | 19 ---- .../repo-creation-scripts/create_arch_repo.sh | 19 ---- .../create_centos_repo.sh | 28 ------ .../create_ubuntu_repo.sh | 42 -------- .../hipsycl-centos-7.repo | 7 -- devops/repos/test-installation.sh | 85 ---------------- devops/repos/test-packages.sh | 93 ------------------ devops/repos/update_repo.sh | 98 ------------------- 21 files changed, 720 deletions(-) delete mode 100644 devops/repos/README.md delete mode 100644 devops/repos/common/init.sh delete mode 100644 devops/repos/create_pkgs.sh delete mode 100644 devops/repos/create_repos.sh delete mode 100644 devops/repos/create_singularity_containers.sh delete mode 100644 devops/repos/definitions-packaging-container/base-archlinux-rolling.def delete mode 100644 devops/repos/definitions-packaging-container/base-centos-7.def delete mode 100644 
devops/repos/definitions-packaging-container/base-ubuntu-18.04.def delete mode 100644 devops/repos/definitions-test-containers/archlinux-rolling.def delete mode 100644 devops/repos/definitions-test-containers/centos-7.def delete mode 100644 devops/repos/definitions-test-containers/ubuntu-18.04.def delete mode 100644 devops/repos/definitions-test-containers/ubuntu-20.04.def delete mode 100644 devops/repos/publish_test_container.sh delete mode 100644 devops/repos/record_env_vars.sh delete mode 100644 devops/repos/repo-creation-scripts/create_arch_repo.sh delete mode 100644 devops/repos/repo-creation-scripts/create_centos_repo.sh delete mode 100644 devops/repos/repo-creation-scripts/create_ubuntu_repo.sh delete mode 100644 devops/repos/repo-creation-scripts/hipsycl-centos-7.repo delete mode 100755 devops/repos/test-installation.sh delete mode 100755 devops/repos/test-packages.sh delete mode 100644 devops/repos/update_repo.sh diff --git a/devops/repos/README.md b/devops/repos/README.md deleted file mode 100644 index 8c4bab79f..000000000 --- a/devops/repos/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# hipSYCL packaging system - -Currently the packaging is based around three groups of bash scripts bound together by the `update-repos.sh`, and the `common/init.sh scripts`. We aimed for having most of these scripts available for use separately from the packaging system, and to serve as inspiration. - -The three logical groups are installation, package creation, repository creation, and testing. installation and package creation scripts are located in the `install/scripts` directory, repo creation and testing scripts are located in the `devops/repos directory`. - -We provide a high level overview of the different functions here please refer to the actual scripts for more detail - -## update_repo.sh - -This script serves as a wrapper around the different other scripts that are responsible for building packaging and testing. 
It is usefulness lies in creating a access point for all the functions that are scattered among the different directories. - -## record_env_vars.sh - -Creates the `~/envs.out` file, based on the current environment. - -## create_pkgs.sh - -Executes the packaging script for a distro. and moves the finished packages to the staging folder. It has two modes, `hipsycl` and `base` the former only builds the hipSYCL packages later only builds the base packages - -## create_repos.sh - -Executes the repo creation for a distribution. - -## test-packages.sh - -Handles testing of the built and deployed packages for a certain backend configuration. - -## test-installation.sh - -Run tests on a singularity container containing hipSYCLs - -## Examples - -``` -bash update_repo.sh centos-7 build_base build # Build base container -bash update_repo.sh centos-7 build_base spack-install/rocm # Install rocm into base container -bash update_repo.sh centos-7 build_base spack-install/llvm # Install llvm -bash update_repo.sh centos-7 package base # create base packages for rocm and llvm&boost -bash update_repo.sh centos-7 package hipsycl # create hipsycl packages -bash update_repo.sh centos-7 deploy # deploy packages -bash update_repo.sh centos-7 test 00 # run build, add_repo install_dep run_test for the test -bash update_repo.sh centos-7 test 00 build # build testing container -bash update_repo.sh centos-7 test 00 add_repo # Add hipSYCL repo to testing container -bash update_repo.sh centos-7 pub_cont # Publish containers -``` - - - diff --git a/devops/repos/common/init.sh b/devops/repos/common/init.sh deleted file mode 100644 index 651043850..000000000 --- a/devops/repos/common/init.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -HIPSYCL_PKG_REPO_STAGE_DIR=${HIPSYCL_PKG_REPO_STAGE_DIR:-./stage} -export HIPSYCL_GPG_KEY=E967BA09716F870320089583E68CC4B9B2B75080 - -#Testing -declare -A install_cmd=( ["archlinux-rolling"]="pacman -Sy --noconfirm hipSYCL" \ - ["centos-7"]="yum -y install 
hipSYCL" \ - ["centos-8"]="yum -y install hipSYCL" \ - ["ubuntu-18.04"]="apt -y install hipsycl" \ - ["ubuntu-20.04"]="apt -y install hipsycl" - ) - -declare -A cleanup_cmd=( ["archlinux-rolling"]="pacman -Rsn --noconfirm hipSYCL" \ - ["centos-7"]="yum -y remove hipSYCL" \ - ["centos-8"]="yum -y remove hipSYCL" \ - ["ubuntu-18.04"]="apt -y remove hipsycl" \ - ["ubuntu-20.04"]="apt -y remove hipsycl" - ) - -declare -A cleanup_dep=( ["archlinux-rolling"]='pacman -Rsn --noconfirm $(pacman -Qdtq)' \ - ["centos-7"]="package-cleanup -y --leaves" \ - ["centos-8"]="package-cleanup -y --leaves" \ - ["ubuntu-18.04"]="apt -y autoremove" \ - ["ubuntu-20.04"]="apt -y autoremove" - ) - - -declare -A image_base=( ["archlinux-rolling"]="docker://archlinux:base" \ - ["centos-7"]="docker://centos:centos7" \ - ["centos-8"]="docker://centos:centos8" \ - ["ubuntu-18.04"]="docker://ubuntu:18.04" \ - ["ubuntu-20.04"]="docker://ubuntu:20.04" - ) - -declare -A pkg_suffix=( ["ONON"]="-full" ["OFFOFF"]="-omp" ["OFFON"]="-cuda" \ - ["ONOFF"]="-rocm") - -#Packging - - -declare -A find_built_pkg=( ["archlinux-rolling"]='4.pkg.tar' \ - ["centos-7"]='4.rpm' \ - ["centos-8"]='4.rpm' \ - ["ubuntu-18.04"]='\.deb' \ - ) -declare -A packaging_script=( ["archlinux-rolling"]="make-archlinux-pkg.sh" \ - ["centos-7"]="make-centos-7-pkg.sh" \ - ["centos-8"]="make-centos-8-pkg.sh" \ - ["ubuntu-18.04"]="make-ubuntu-pkg.sh" \ - ) -declare -A packaging_image=( ["archlinux-rolling"]="archlinux-rolling" \ - ["centos-7"]="centos-7" \ - ["centos-8"]="centos-8" \ - ["ubuntu-18.04"]="ubuntu-18.04" \ - ) - -declare -A stage_dir=( ["archlinux-rolling"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_arch" \ - ["centos-7"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_centos-7" \ - ["centos-8"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_centos-8" \ - ["ubuntu-18.04"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_ubuntu" \ - ) - -#Repo creation -declare -A repo_tools_cont=( ["archlinux-rolling"]="arch.sif" \ - ["centos-7"]="centos-7.sif" \ - 
["centos-8"]="centos-7.sif" \ - ["ubuntu-18.04"]="ubuntu-18.04.sif" \ - ["ubuntu-20.04"]="ubuntu-18.04.sif" - ) - -declare -A repo_script=( ["archlinux-rolling"]="create_arch_repo.sh" \ - ["centos-7"]="create_centos_repo.sh centos-7" \ - ["centos-8"]="create_centos_repo.sh centos-8" \ - ["ubuntu-18.04"]="create_ubuntu_repo.sh bionic" \ - ["ubuntu-20.04"]="create_ubuntu_repo.sh focal" - ) - - - -#distros=( "centos-7" "archlinux-rolling" "ubuntu-18.04" "ubuntu-20.04") -#build_distros=( "centos-7" "archlinux-rolling" "ubuntu-18.04" ) diff --git a/devops/repos/create_pkgs.sh b/devops/repos/create_pkgs.sh deleted file mode 100644 index 345331dac..000000000 --- a/devops/repos/create_pkgs.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -if [ "$#" -lt 1 ]; then - echo " - Responsible for creating the packages from the built containers where the hipSYCL stack is installed. - Currently there are two modes supported hipsycl and base. hipsycl will only build the base packages, base will build all the - base packages, (rocm, base,) - Usage: - " - exit -1 -fi -distro=$1 -option=${2:-"hipsycl"} - - -HIPSYCL_PKG_BUILD_ROCM=ON -HIPSYCL_PKG_BUILD_BASE=ON -HIPSYCL_PKG_BUILD_HIPSYCL=ON - -if [ "$option" = "hipsycl" ]; then - HIPSYCL_PKG_BUILD_BASE=OFF - HIPSYCL_PKG_BUILD_ROCM=OFF -elif [ "$option" = "base" ]; then - HIPSYCL_PKG_BUILD_HIPSYCL=OFF -fi - -export HIPSYCL_PKG_BUILD_ROCM -export HIPSYCL_PKG_BUILD_BASE -export HIPSYCL_PKG_BUILD_HIPSYCL - -source $HIPSYCL_PKG_DEVOPS_DIR/common/init.sh - -HIPSYCL_PKG_DEVOPS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -HIPSYCL_PKG_SCRIPT_DIR=${HIPSYCL_PKG_SCRIPT_DIR:-../../install/scripts/} -HIPSYCL_PKG_SCRIPT_DIR_ABS=$HIPSYCL_PKG_DEVOPS_DIR/$HIPSYCL_PKG_SCRIPT_DIR -export HIPSYCL_PACKAGING_DIR="/tmp/hipsycl-packages-$distro" -cd $HIPSYCL_PKG_SCRIPT_DIR_ABS/packaging - -export SINGULARITYENV_HIPSYCL_PACKAGING_DIR=$HIPSYCL_PACKAGING_DIR - -stage_dir=${stage_dir[$distro]} - -singularity exec 
$HIPSYCL_PKG_CONTAINER_DIR/hipsycl-${packaging_image[$distro]} bash ${packaging_script[$distro]} - -mkdir -p $HIPSYCL_PKG_DEVOPS_DIR/$stage_dir -for file in `find /tmp/hipsycl-packages-$distro | grep ${find_built_pkg[$distro]}`; do - mv $file $HIPSYCL_PKG_DEVOPS_DIR/$stage_dir/ -done -rm -rf $SINGULARITYENV_HIPSYCL_PACKAGING_DIR \ No newline at end of file diff --git a/devops/repos/create_repos.sh b/devops/repos/create_repos.sh deleted file mode 100644 index 9f263ed96..000000000 --- a/devops/repos/create_repos.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -o xtrace -distro=$1 -if [ -z $1 ]; then - echo "Provide the name of the distro as the first command line argument" - exit -1 -fi - -HIPSYCL_PKG_DEVOPS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -source $HIPSYCL_PKG_DEVOPS_DIR/common/init.sh - -cd $HIPSYCL_PKG_DEVOPS_DIR -SINGULARITY_BASE_DIR=${SINGULARITY_BASE_DIR:-./containers} -HIPSYCL_PKG_REPO_BASE_DIR=${HIPSYCL_PKG_REPO_BASE_DIR:-/data/repos} -HIPSYCL_PKG_SCRIPT_DIR=${HIPSYCL_PKG_SCRIPT_DIR:-$HIPSYCL_PKG_DEVOPS_DIR/repo-creation-scripts} -mkdir -p $HIPSYCL_PKG_REPO_BASE_DIR - -echo "$HIPSYCL_PKG_REPO_BASE_DIR" -singularity -d exec --fakeroot -B $HIPSYCL_PKG_REPO_BASE_DIR:/data/repos/ -B $HIPSYCL_PKG_DEVOPS_DIR:$HIPSYCL_PKG_DEVOPS_DIR \ - $SINGULARITY_BASE_DIR/${repo_tools_cont[$distro]} bash $HIPSYCL_PKG_SCRIPT_DIR/${repo_script[$distro]} - \ No newline at end of file diff --git a/devops/repos/create_singularity_containers.sh b/devops/repos/create_singularity_containers.sh deleted file mode 100644 index 4d75802e8..000000000 --- a/devops/repos/create_singularity_containers.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -SINGULARITY_BASE_DIR=${SINGULARITY_BASE_DIR:-./containers/} -SINGULARITY_DEF_DIR=${SINGULARTIY_DEF_DIR:-./definitions-packaging-container/} - -singularity build --fakreoot $SINGULARITY_BASE_DIR/centos-7.sif \ - $SINGULARITY_DEF_DIR/base-centos-7.def - -singularity build --fakreoot 
$SINGULARITY_BASE_DIR/ubuntu-18.04.sif \ - $SINGULARITY_DEF_DIR/base-ubuntu-18.04.def - -singularity build --fakreoot $SINGULARITY_BASE_DIR/arch.sif \ - $SINGULARITY_DEF_DIR/base-archlinux-rolling.def diff --git a/devops/repos/definitions-packaging-container/base-archlinux-rolling.def b/devops/repos/definitions-packaging-container/base-archlinux-rolling.def deleted file mode 100644 index a250d545b..000000000 --- a/devops/repos/definitions-packaging-container/base-archlinux-rolling.def +++ /dev/null @@ -1,8 +0,0 @@ -BootStrap: docker -From: archlinux:base - -%setup - -%post -pacman -Sy --noconfirm -pacman -S --noconfirm grep diff --git a/devops/repos/definitions-packaging-container/base-centos-7.def b/devops/repos/definitions-packaging-container/base-centos-7.def deleted file mode 100644 index 76620d7dd..000000000 --- a/devops/repos/definitions-packaging-container/base-centos-7.def +++ /dev/null @@ -1,11 +0,0 @@ -BootStrap: docker -From: centos:centos7 - -%setup - -%files - -%environment - -%post -yum -y install gpg vim wget createrepo rpm-sign diff --git a/devops/repos/definitions-packaging-container/base-ubuntu-18.04.def b/devops/repos/definitions-packaging-container/base-ubuntu-18.04.def deleted file mode 100644 index f7c8c8b8a..000000000 --- a/devops/repos/definitions-packaging-container/base-ubuntu-18.04.def +++ /dev/null @@ -1,13 +0,0 @@ -BootStrap: docker -From: ubuntu:18.04 - -%setup - -%files - -%environment - -%post -apt -y update -apt -y install dpkg-dev dpkg-sig apt-utils - diff --git a/devops/repos/definitions-test-containers/archlinux-rolling.def b/devops/repos/definitions-test-containers/archlinux-rolling.def deleted file mode 100644 index b84390f14..000000000 --- a/devops/repos/definitions-test-containers/archlinux-rolling.def +++ /dev/null @@ -1,9 +0,0 @@ -BootStrap: docker -From: archlinux:base - -%setup - -%post -pacman -Syu --noconfirm -pacman -Sy --noconfirm awk wget make base-devel - diff --git 
a/devops/repos/definitions-test-containers/centos-7.def b/devops/repos/definitions-test-containers/centos-7.def deleted file mode 100644 index f754087b5..000000000 --- a/devops/repos/definitions-test-containers/centos-7.def +++ /dev/null @@ -1,17 +0,0 @@ -BootStrap: docker -From: centos:centos7 - -%setup - -%files - -%environment -HIPSYCL_BASE_CC=gcc -HIPSYCL_BASE_CXX=g++ -. /opt/rh/devtoolset-9/enable - -%post -yum update -y -yum install epel-release -y -yum install -y rpm-build sed wget curl patch -yum install centos-release-scl -y \ No newline at end of file diff --git a/devops/repos/definitions-test-containers/ubuntu-18.04.def b/devops/repos/definitions-test-containers/ubuntu-18.04.def deleted file mode 100644 index 4ec4fafa4..000000000 --- a/devops/repos/definitions-test-containers/ubuntu-18.04.def +++ /dev/null @@ -1,13 +0,0 @@ -BootStrap: docker -From: ubuntu:18.04 - -%setup - -%files - -%environment - -%post -apt update -y -apt install -y wget gawk gnupg apt-utils build-essential -apt install -y software-properties-common \ No newline at end of file diff --git a/devops/repos/definitions-test-containers/ubuntu-20.04.def b/devops/repos/definitions-test-containers/ubuntu-20.04.def deleted file mode 100644 index 9a650f5ab..000000000 --- a/devops/repos/definitions-test-containers/ubuntu-20.04.def +++ /dev/null @@ -1,12 +0,0 @@ -BootStrap: docker -From: ubuntu:20.04 - -%setup - -%files - -%environment - -%post -apt update -y -apt install -y wget gawk gnupg apt-utils build-essential \ No newline at end of file diff --git a/devops/repos/publish_test_container.sh b/devops/repos/publish_test_container.sh deleted file mode 100644 index 5268b8758..000000000 --- a/devops/repos/publish_test_container.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -o xtrace -set -e -distro=$1 -cd $HIPSYCL_PKG_DEVOPS_DIR -commit_hash=`git rev-parse --short HEAD` -cd - -date=`date -u +"%Y%m%d"` -supported_backends="omp" 
-HIPSYCL_PKG_PUBLIC_CONTAINER_DIR=${HIPSYCL_PKG_PUBLIC_CONTAINER_DIR:-/data/repos/singularity/} -HIPSYCL_TEST_DIR=${HIPSYCL_TEST_DIR:-"/data/hipsyclbot/test-dir"} -mkdir -p $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR -for backend in `ls $HIPSYCL_TEST_DIR | sed -n -e "s/^.*$distro-//p"`; do - if [[ ${backend:0:1} = "1" ]]; then supported_backends="${supported_backends}-rocm"; fi - if [[ ${backend:1:2} = "1" ]]; then supported_backends="${supported_backends}-cuda"; fi - container_name_base="hipSYCL-${HIPSYCL_PKG_TYPE}-${distro}-${supported_backends}" - container_name="${container_name_base}-${date}-${commit_hash}.sif" - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backend rm -rf /opt/hipSYCL/cuda - #On arch these sockets cause a error while packing the container - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backend rm -rf /etc/pacman.d/gnupg/S.gpg-agent.browser \ - /etc/pacman.d/gnupg/S.gpg-agent.ssh /etc/pacman.d/gnupg/S.gpg-agent.extra /etc/pacman.d/gnupg/S.gpg-agent - singularity build --force --fakeroot $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR/$container_name $HIPSYCL_TEST_DIR/hipsycl-$distro-$backend - supported_backends="omp" - #Keep only the two latest container from each kind - ls -t $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR | grep $container_name_base-[0-9] | tail -n +3 | xargs -I '_' rm -rf $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR/_ -done - - - - - diff --git a/devops/repos/record_env_vars.sh b/devops/repos/record_env_vars.sh deleted file mode 100644 index 9e8cbd480..000000000 --- a/devops/repos/record_env_vars.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# This small script is needed for the workflow to work -# since it is currently not possible to build containers -# inside containers, we break out from the container containing -# the workflow to a separate user on our server -# -# The GitHub Action sets the variables in its container -# we use this script to record the variables -# and then we copy the variables in a 
sourceable form -# to the user where the actual building will happen. - -rm -rf envs.out -touch envs.out -for env in `env | grep HIPSYCL`; do - echo "export $env" >> envs.out -done - - diff --git a/devops/repos/repo-creation-scripts/create_arch_repo.sh b/devops/repos/repo-creation-scripts/create_arch_repo.sh deleted file mode 100644 index 0a6f3ab73..000000000 --- a/devops/repos/repo-creation-scripts/create_arch_repo.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace - -# We assume that the packages are already signed -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source $DIR/../common/init.sh -ARCH_REPO_DIR=/data/repos/archlinux/x86_64/ - -mkdir -p $ARCH_REPO_DIR - -cd ${stage_dir["archlinux-rolling"]} -for f in *.tar.zst -do - mv $f $ARCH_REPO_DIR - mv $f.sig $ARCH_REPO_DIR - repo-add --sign -k B2B75080 $ARCH_REPO_DIR/hipsycl.db.tar $ARCH_REPO_DIR/$f -done - diff --git a/devops/repos/repo-creation-scripts/create_centos_repo.sh b/devops/repos/repo-creation-scripts/create_centos_repo.sh deleted file mode 100644 index cec5d77ae..000000000 --- a/devops/repos/repo-creation-scripts/create_centos_repo.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -if [ "$#" -ne "1" ]; then - echo "Please specifiy the distro (centos7 or cnetos8) as first argument" -fi -distro=$1 - -declare -A repo_dir=( ["centos-7"]="centos7" \ - ["centos-8"]="centos8" \ - ) -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source $DIR/../common/init.sh -CENTOS_REPO_DIR=/data/repos/rpm/${repo_dir[$distro]} -mkdir -p $CENTOS_REPO_DIR -cd ${stage_dir[$distro]} -echo $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX -for f in * -do - echo $f - mv $f $CENTOS_REPO_DIR - echo "" | setsid rpmsign --addsign $CENTOS_REPO_DIR/$f -done -createrepo $CENTOS_REPO_DIR -cp $DIR/hipsycl-$distro.repo $CENTOS_REPO_DIR/hipsycl.repo -echo $DIR -echo $CENTOS_REPO_DIR -sed "s|sycl{}|sycl$HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX|" $DIR/hipsycl-$distro.repo > 
$CENTOS_REPO_DIR/hipsycl.repo diff --git a/devops/repos/repo-creation-scripts/create_ubuntu_repo.sh b/devops/repos/repo-creation-scripts/create_ubuntu_repo.sh deleted file mode 100644 index 88cd31af9..000000000 --- a/devops/repos/repo-creation-scripts/create_ubuntu_repo.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source $DIR/../common/init.sh - -UBUNTU_REPO_DIR=${UBUNTU_REPO_DIR:-/data/repos/deb} -DIST=${1:-bionic} - -PKG_PATH=$UBUNTU_REPO_DIR/dists/$DIST/main/binary-amd64/ -RELEASE_PATH=$UBUNTU_REPO_DIR/dists/$DIST/ -POOL_PATH=$UBUNTU_REPO_DIR/pool/ - -mkdir -p $PKG_PATH -mkdir -p $POOL_PATH - -cd ${stage_dir["ubuntu-18.04"]} - -for f in * -do - echo $f - set +e - mv $f $POOL_PATH - set -e -done -cd $UBUNTU_REPO_DIR -# we need the relative path because it will write it directly in Packages -apt-ftparchive packages ./pool > $PKG_PATH/Packages - -cd $PKG_PATH -gzip -k -f $PKG_PATH/Packages || true -cd $RELEASE_PATH -echo `pwd` -apt-ftparchive release . 
| tee $RELEASE_PATH/Release - -echo `pwd` -rm -f Release.gpg -rm -f InRelease -gpg --batch --no-tty --default-key B2B75080 -abs -o Release.gpg Release -gpg --batch --no-tty --default-key B2B75080 --clearsign -o InRelease Release - diff --git a/devops/repos/repo-creation-scripts/hipsycl-centos-7.repo b/devops/repos/repo-creation-scripts/hipsycl-centos-7.repo deleted file mode 100644 index 581b507b0..000000000 --- a/devops/repos/repo-creation-scripts/hipsycl-centos-7.repo +++ /dev/null @@ -1,7 +0,0 @@ -[hipSYCL-repository] -name=hipSYCL repository -baseurl=http://repo.urz.uni-heidelberg.de/sycl{}/rpm/centos7/ -gpgcheck=1 -gpgkey=http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc -enabled=1 - diff --git a/devops/repos/test-installation.sh b/devops/repos/test-installation.sh deleted file mode 100755 index 714f977b9..000000000 --- a/devops/repos/test-installation.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -set -xv -if [ "$#" -ne 3 ]; then - echo " - This script is responsible for testing the installation inside a built container containing a hipSYCL installation - the following tests are built and executed: sycl_tests - - - usage: - - dir_of_test_scripts: Points to the directory where this script is located - distro: The distribution for which the packages are suposed to be tested - backend: A bitmask of the enabled backends, from leat to most important bit: CUDA,ROCM. 1 means enabled 0 means disabled - - Important ENV variables: - - HIPSYCL_TEST_DIR: The location where the test containers will be installed - - HIPSYCL_TEST_EXCLUDE_FROM_RT: by default set to hip:gfx900. For this backend, we only build the tests. 
- " - exit -1 -fi -cd $1 -distro=$2 -backend=$3 -HIPSYCL_WITH_CUDA="OFF" -HIPSYCL_WITH_ROCM="OFF" -if [[ ${backend:0:1} = "1" ]]; then HIPSYCL_WITH_ROCM="ON"; else HIPSYCL_WITH_ROCM="OFF"; fi -if [[ ${backend:1:2} = "1" ]]; then HIPSYCL_WITH_CUDA="ON"; else HIPSYCL_WITH_CUDA="OFF"; fi -source ./common/init.sh -current_time=$(date "+%Y.%m.%d-%H.%M.%S") - -cmake_path=/opt/hipSYCL/llvm/cmake/bin/cmake -HIPSYCL_TEST_LOG_DIR=${HIPSYCL_TEST_LOG_DIR:-/tmp/hipsycl-logs} -mkdir -p $HIPSYCL_TEST_LOG_DIR -HIPSYCL_TEST_CUDA_ARCH=${HIPSYCL_TEST_CUDA_ARCH:-sm_61} -HIPSYCL_TEST_ROCM_ARCH=${HIPSYCL_TEST_ROCM_ARCH:-gfx900} - -log_file=${log_file:-$HIPSYCL_TEST_LOG_DIR/hipSYCL_image_test-$current_time} -touch $log_file -slurm_out=${slurm_out:-$log_file} - -targets=( "omp" ) -[ "$HIPSYCL_WITH_CUDA" = "ON" ] && targets+=( "cuda:$HIPSYCL_TEST_CUDA_ARCH" ) -[ "$HIPSYCL_WITH_ROCM" = "ON" ] && targets+=( "hip:$HIPSYCL_TEST_ROCM_ARCH" ) - - -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-develop} - -echo "Testing hipSYCL singularity images at $HIPSYCL_PKG_CONTAINER_DIR for targets ${targets[*]}" >> $log_file -echo "Cloning form user $HIPSYCL_REPO_USER branch $HIPSYCL_REPO_BRANCH " >> $log_file - -HIPSYCL_TEST_EXCLUDE_FROM_RT=${HIPSYCL_TEST_EXCLUDE_FROM_RT:-"hip:gfx900"} -DIR=`pwd` - -mkdir -p /tmp/hipSYCL-test/tests/build -mkdir -p /tmp/build/$distro-$backend - -for target in ${targets[@]}; do - echo "Starting test for $target for $distro" >> $log_file - singularity exec --cleanenv $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro-$backend \ - $cmake_path \ - -DCMAKE_PREFIX_PATH=/opt/hipSYCL/boost/boost \ - -DCMAKE_C_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang \ - -DCMAKE_CXX_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang++ \ - -DHIPSYCL_TARGETS=$target \ - -S /tmp/hipSYCL-test/tests \ - -B /tmp/build/$distro-$backend - - - VERBOSE=1 CUDA_VISIBLE_DEVICES=0 singularity exec --nv \ - -H /tmp/build/$distro-$backend 
$HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro-$backend \ - make -j 16 - - if [ ! "$target" = $HIPSYCL_TEST_EXCLUDE_FROM_RT ] ;then - #CUDA_VISIBLE_DEVICES=0 \ - singularity exec --nv \ - $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro-$backend \ - /tmp/build/$2-$3/sycl_tests - else - echo "test_skipped" >> $log_file - fi - rm -rf /tmp/build/$2-$3 -done diff --git a/devops/repos/test-packages.sh b/devops/repos/test-packages.sh deleted file mode 100755 index 385452175..000000000 --- a/devops/repos/test-packages.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -source ~/envs.out -if [ "$#" -lt 4 ]; then - echo " - This script is responsible for creating a base image (base), adding the hipSYCL repo (add_repo), - installing the hipSYCL package and its dependencies (install_dependencies), and then running the tests - (run_tests) and eventually cleaning up (clean_up) for the specified distribution and backend combination - - usage: - [action: build, add_repo, intall_dependencies, run_test, clean_up] - - dir_of_test_scripts: Points to the directory where this script is located - distro: The distribution for which the packages are supposed to be tested - backends: A bitmask of the enabled backends, from leat to most important bit: CUDA,ROCM. 1 means enabled 0 means disabled - actions: build: build a container image for the specified distribution: - it creates an image in the directory: HIPSYCL_TEST_DIR/distro-backend folder - add_repo: run the ../../install/scripts/add-repo-.sh script to add the hipSYCL repo to the base image - install_dependencis: Installs the version of hipSYCL with the targeted backends, in case of Cuda backend, is tested, install Cuda from an external source see - ../../install/scripts/spack-install-Cuda.sh - run_test: executes the ./test-installation.sh script in the built singularity container. - clean_up: Useful if the container is going to be reused. Deletes all installed packages and Cuda if necessary. 
- target_repo: an optional path to the repository directory from the hipSYCL base repo. useful if testing experimental repos - - Important ENV variables: - - HIPSYCL_TEST_DIR: The location where the test containers will be installed - " - exit -1 -fi -home_dir=$1 -distro=$2 -backends=$3 -action=$4 -target_repo=$5 - -HIPSYCL_WITH_CUDA="OFF" -HIPSYCL_WITH_ROCM="OFF" -if [[ ${backends:0:1} = "1" ]]; then HIPSYCL_WITH_ROCM="ON"; else HIPSYCL_WITH_ROCM="OFF"; fi -if [[ ${backends:1:2} = "1" ]]; then HIPSYCL_WITH_CUDA="ON"; else HIPSYCL_WITH_CUDA="OFF"; fi - -cd $home_dir -source ./common/init.sh -#slurm_out=$1/slurm-$SLURM_JOB_ID.out -#target_repo=${2:-""} - -echo $slurm_out -current_time=$(date "+%Y.%m.%d-%H.%M.%S") -HIPSYCL_TEST_DIR=${HIPSYCL_TEST_DIR:-/tmp/hipsycl-test/} -echo $HIPSYCL_TEST_DIR -HIPSYCL_PKG_TYPE=${HIPSYCL_PKG_TYPE:-"-nightly"} -mkdir -p $HIPSYCL_TEST_DIR -export slurm_out - -dict_key="$HIPSYCL_WITH_ROCM$HIPSYCL_WITH_CUDA" -echo "Starting comprehensive testing of the package repositories for ${distros[*]}" - -if [ "$action" = "build" ];then - singularity build --fakeroot --force --sandbox $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends ./definitions-test-containers/$distro.def - - -elif [ "$action" = "add_repo" ]; then - singularity exec --fakeroot --writable -B ../../install/scripts:/mnt \ - $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends sh /mnt/add-hipsycl-repo/$distro.sh $target_repo - - -elif [ "$action" = "install_dep" ]; then - if [ "$HIPSYCL_WITH_CUDA" = "ON" ]; then - singularity exec --fakeroot --writable -B ../../install/scripts:/mnt \ - $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends sh /mnt/spack-install/cuda.sh - fi - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends \ - ${install_cmd[$distro]}${pkg_suffix[$dict_key]}-$HIPSYCL_PKG_TYPE - - -elif [ "$action" = "run_tests" ]; then - export HIPSYCL_WITH_CUDA - export HIPSYCL_WITH_ROCM - echo "Start testing" - HIPSYCL_PKG_CONTAINER_DIR=$HIPSYCL_TEST_DIR - export 
HIPSYCL_PKG_CONTAINER_DIR - `pwd`/test-installation.sh `pwd` $distro $backends - - -elif [ "$action" = "clean_up" ]; then - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends \ - ${cleanup_cmd[$distro]}${pkg_suffix[$dict_key]}-$HIPSYCL_PKG_TYPE - - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends \ - ${cleanup_dep[$distro]} - - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends rm -rf /opt/hipSYCL/cuda -fi diff --git a/devops/repos/update_repo.sh b/devops/repos/update_repo.sh deleted file mode 100644 index 73840b28d..000000000 --- a/devops/repos/update_repo.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash - -set -e -set -o xtrace -if [ $1 = "--help" ]; then - echo " - This file is responsible for driving the packaging, building, and testing process for the hipSYCL packaging system. - It sets and exports defaults for the important environment variables that might concern the builds - - Usage: $ update_repo.sh [option] - distro: centos-7, ubuntu-18.04 etc... 
- action: build_base, build_hipsycl, package, deploy, test" - exit -1 -fi - -distro=$1 -action=$2 -option=$3 -set +e -source /etc/profile -set -e -source ${HIPSYCL_PKG_ENV_FILE:-~/envs.out} - -HIPSYCL_PKG_DEVOPS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -HIPSYCL_PKG_SCRIPT_DIR=${HIPSYCL_PKG_SCRIPT_DIR:-../../install/scripts/} -HIPSYCL_PKG_SCRIPT_DIR_ABS=$HIPSYCL_PKG_DEVOPS_DIR/$HIPSYCL_PKG_SCRIPT_DIR -HIPSYCL_PKG_REPO_BASE_DIR=${HIPSYCL_PKG_REPO_BASE_DIR:-/data/repos/} -HIPSYCL_PKG_REPO_BASE_DIR=$HIPSYCL_PKG_REPO_BASE_DIR/$HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX -HIPSYCL_PKG_PUBLIC_CONTAINER_DIR=${HIPSYCL_PKG_PUBLIC_CONTAINER_DIR:-/data/repos/singularity/} -source $HIPSYCL_PKG_DEVOPS_DIR/common/init.sh - -HIPSYCL_TEST_DIR="/data/hipsyclbot/test-dir" -mkdir -p $HIPSYCL_TEST_DIR - -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-stable} - -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-9} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-1} - -HIPSYCL_HIP_VERSION=${HIPSYCL_HIP_VERSION:-4.0.0} - -HIPSYCL_PKG_CONTAINER_DIR_SUFFIX=${HIPSYCL_PKG_CONTAINER_DIR_SUFFIX:-containers} -HIPSYCL_PKG_CONTAINER_DIR_SUFFIX=${HIPSYCL_PKG_CONTAINER_DIR_SUFFIX}${HIPSYCL_PKG_NAME_SUFFIX} -HIPSYCL_PKG_CONTAINER_DIR_NAME=${HIPSYCL_PKG_LLVM_REPO_BRANCH/release\//llvm-}- -HIPSYCL_PKG_CONTAINER_DIR=${HIPSYCL_PKG_CONTAINER_DIR:-$HIPSYCL_PKG_SCRIPT_DIR_ABS/${HIPSYCL_PKG_CONTAINER_DIR_NAME}-${HIPSYCL_PKG_CONTAINER_DIR_SUFFIX}} -HIPSYCL_PKG_TYPE=${HIPSYCL_PKG_TYPE:-nightly} - -export HIPSYCL_PKG_CONTAINER_DIR -export HIPSYCL_PKG_LLVM_REPO_BRANCH -export HIPSYCL_PKG_LLVM_VERSION_MAJOR -export HIPSYCL_PKG_LLVM_VERSION_MINOR -export HIPSYCL_PKG_LLVM_VERSION_PATCH -export HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX -export HIPSYCL_REPO_USER -export HIPSYCL_REPO_BRANCH -export HIPSYCL_PKG_TYPE -export HIPSYCL_PKG_NAME_SUFFIX -export 
HIPSYCL_PKG_DEVOPS_DIR -export HIPSYCL_WITH_CUDA -export HIPSYCL_WITH_ROCM - - -if [ "$action" = "build_base" ]; then - bash $HIPSYCL_PKG_SCRIPT_DIR_ABS/rebuild-images.sh $distro $option -fi - -if [ "$action" = "build_hipsycl" ]; then - bash $HIPSYCL_PKG_SCRIPT_DIR_ABS/rebuild-images.sh $distro cleanup - bash $HIPSYCL_PKG_SCRIPT_DIR_ABS/rebuild-images.sh $distro $option -fi - -if [ "$action" = "package" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/create_pkgs.sh $distro $option -fi - -if [ "$action" = "deploy" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/create_repos.sh $distro -fi - -if [ "$action" = "test" ]; then - if [ -z "${@:4}" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option build $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option add_repo $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option install_dep $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option run_tests $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - rm -rf /data/sbalint/singularity_tmp/* - else - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option $4 $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - fi -fi - - -if [ "$action" = "pub_cont" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/publish_test_container.sh $distro -fi From 9c07a78445794e7b38f3da15fdbafebb7387368b Mon Sep 17 00:00:00 2001 From: VaiTon Date: Sat, 7 Dec 2024 21:02:16 +0100 Subject: [PATCH 094/126] Build Clang plugin as module Otherwise -Wl,--no-undefined will make the build fail --- bin/acpp | 4 +--- src/compiler/CMakeLists.txt | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/acpp b/bin/acpp index f806c7001..ba16ce2de 100755 --- a/bin/acpp +++ b/bin/acpp @@ -745,8 +745,6 @@ class acpp_config: def 
acpp_plugin_path(self): if sys.platform.startswith('win32'): return os.path.join(self.acpp_installation_path, "bin", "acpp-clang.dll") - elif sys.platform == "darwin": - return os.path.join(self.acpp_installation_path, "lib", "libacpp-clang.dylib") else: return os.path.join(self.acpp_installation_path, "lib", "libacpp-clang.so") @@ -1531,7 +1529,7 @@ class llvm_sscp_invocation: def get_cxx_flags(self): flags = ["-D__ACPP_ENABLE_LLVM_SSCP_TARGET__", "-Xclang", "-disable-O0-optnone", "-mllvm", "-acpp-sscp"] - + if self._config.is_export_all: flags += ["-mllvm","-acpp-sscp-export-all"] diff --git a/src/compiler/CMakeLists.txt b/src/compiler/CMakeLists.txt index b108eec0b..f9ebb1d39 100644 --- a/src/compiler/CMakeLists.txt +++ b/src/compiler/CMakeLists.txt @@ -36,13 +36,13 @@ if(WITH_ACCELERATED_CPU OR WITH_SSCP_COMPILER) add_library(acpp-clang-cbs OBJECT ${CBS_PLUGIN} ) - + set_property(TARGET acpp-clang-cbs PROPERTY POSITION_INDEPENDENT_CODE ON) endif() if(WITH_SSCP_COMPILER) set(WITH_REFLECTION_BUILTINS ON) - set(SSCP_COMPILER + set(SSCP_COMPILER sscp/KernelOutliningPass.cpp sscp/IRConstantReplacer.cpp sscp/DynamicFunctionSupport.cpp @@ -75,7 +75,7 @@ else() set(REFLECTION_BUILTINS "") endif() -add_library(acpp-clang SHARED +add_library(acpp-clang MODULE AdaptiveCppClangPlugin.cpp GlobalsPruningPass.cpp ${SSCP_COMPILER} From 6095b5e5d630e7dc7dfc7f824e9e54692b040bb7 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 10 Dec 2024 07:10:46 +0100 Subject: [PATCH 095/126] [algorithms] Add optional dependency lists to all algorithms --- include/hipSYCL/algorithms/algorithm.hpp | 148 ++++++++++-------- include/hipSYCL/algorithms/numeric.hpp | 81 ++++------ .../hipSYCL/algorithms/sort/bitonic_sort.hpp | 4 +- 3 files changed, 108 insertions(+), 125 deletions(-) diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index e7529ffdb..b6c4e4c4b 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ 
b/include/hipSYCL/algorithms/algorithm.hpp @@ -80,10 +80,11 @@ inline bool should_use_memset(const sycl::device& dev) { template sycl::event for_each(sycl::queue &q, ForwardIt first, ForwardIt last, - UnaryFunction2 f) { + UnaryFunction2 f, + const std::vector &deps = {}) { if(first == last) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, + return q.parallel_for(sycl::range{std::distance(first, last)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -91,15 +92,16 @@ sycl::event for_each(sycl::queue &q, ForwardIt first, ForwardIt last, }); } -template -sycl::event for_each_n(sycl::queue& q, - ForwardIt first, Size n, UnaryFunction2 f) { +template +sycl::event for_each_n(sycl::queue &q, ForwardIt first, Size n, + UnaryFunction2 f, + const std::vector &deps = {}) { if(n <= 0) // sycl::event{} represents a no-op that is always finished. // This means it does not respect prior tasks in the task graph! // TODO Is this okay? Can we defer this responsibility to the user? 
return sycl::event{}; - return q.parallel_for(sycl::range{static_cast(n)}, + return q.parallel_for(sycl::range{static_cast(n)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -108,12 +110,12 @@ sycl::event for_each_n(sycl::queue& q, } template -sycl::event transform(sycl::queue& q, - ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 d_first, - UnaryOperation unary_op) { +sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 d_first, UnaryOperation unary_op, + const std::vector &deps = {}) { if(first1 == last1) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first1, last1)}, + return q.parallel_for(sycl::range{std::distance(first1, last1)}, deps, [=](sycl::id<1> id) { auto input = first1; auto output = d_first; @@ -127,10 +129,11 @@ template sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, ForwardIt3 d_first, - BinaryOperation binary_op) { + BinaryOperation binary_op, + const std::vector &deps = {}) { if(first1 == last1) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first1, last1)}, + return q.parallel_for(sycl::range{std::distance(first1, last1)}, deps, [=](sycl::id<1> id) { auto input1 = first1; auto input2 = first2; @@ -144,7 +147,7 @@ sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, template sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, - ForwardIt2 d_first) { + ForwardIt2 d_first, const std::vector &deps = {}) { auto size = std::distance(first, last); if(size == 0) @@ -157,9 +160,9 @@ sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, std::is_same_v && util::is_contiguous() && util::is_contiguous() && detail::should_use_memcpy(q.get_device())) { - return q.memcpy(&(*d_first), &(*first), size * sizeof(value_type1)); + return q.memcpy(&(*d_first), &(*first), size * sizeof(value_type1), deps); } else { - return q.parallel_for(sycl::range{size}, + 
return q.parallel_for(sycl::range{size}, deps, [=](sycl::id<1> id) { auto input = first; auto output = d_first; @@ -237,19 +240,21 @@ sycl::event copy_if(sycl::queue &q, util::allocation_group &scratch_allocations, ScanT{0}, generator, result_processor, deps); } -template -sycl::event copy_n(sycl::queue& q, ForwardIt1 first, Size count, ForwardIt2 result) { +template +sycl::event copy_n(sycl::queue &q, ForwardIt1 first, Size count, + ForwardIt2 result, + const std::vector &deps = {}) { if(count <= 0) return sycl::event{}; auto last = first; std::advance(last, count); - return copy(q, first, last, result); + return copy(q, first, last, result, deps); } template sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, - const T &value) { + const T &value, const std::vector &deps = {}) { auto size = std::distance(first, last); if(size == 0) return sycl::event{}; @@ -257,7 +262,7 @@ sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, using value_type = typename std::iterator_traits::value_type; auto invoke_kernel = [&]() -> sycl::event{ - return q.parallel_for(sycl::range{size}, + return q.parallel_for(sycl::range{size}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -272,7 +277,7 @@ sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, if (detail::all_bytes_equal(value, equal_byte) && detail::should_use_memset(q.get_device())) { return q.memset(&(*first), static_cast(equal_byte), - size * sizeof(T)); + size * sizeof(T), deps); } else { return invoke_kernel(); } @@ -283,21 +288,22 @@ sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, template sycl::event fill_n(sycl::queue& q, - ForwardIt first, Size count, const T& value ) { + ForwardIt first, Size count, const T& value, + const std::vector &deps = {}) { if(count <= Size{0}) return sycl::event{}; auto last = first; std::advance(last, count); - return fill(q, first, last, value); + return fill(q, first, last, value, deps); } - -template 
-sycl::event generate(sycl::queue& q, ForwardIt first, ForwardIt last, Generator g) { +template +sycl::event generate(sycl::queue &q, ForwardIt first, ForwardIt last, + Generator g, const std::vector &deps = {}) { if(first == last) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, + return q.parallel_for(sycl::range{std::distance(first, last)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -305,12 +311,12 @@ sycl::event generate(sycl::queue& q, ForwardIt first, ForwardIt last, Generator }); } -template -sycl::event generate_n(sycl::queue& q, ForwardIt first, - Size count, Generator g) { +template +sycl::event generate_n(sycl::queue &q, ForwardIt first, Size count, Generator g, + const std::vector &deps = {}) { if(count <= 0) return sycl::event{}; - return q.parallel_for(sycl::range{static_cast(count)}, + return q.parallel_for(sycl::range{static_cast(count)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -318,37 +324,38 @@ sycl::event generate_n(sycl::queue& q, ForwardIt first, }); } -template -sycl::event replace(sycl::queue& q, ForwardIt first, ForwardIt last, - const T& old_value, const T& new_value) { +template +sycl::event replace(sycl::queue &q, ForwardIt first, ForwardIt last, + const T &old_value, const T &new_value, + const std::vector &deps = {}) { if(first == last) return sycl::event{}; - return for_each(q, first, last, [=](auto& x){ + return for_each(q, first, last,[=](auto& x){ if(x == old_value) x = new_value; - }); + }, deps); } -template -sycl::event replace_if(sycl::queue& q, ForwardIt first, ForwardIt last, - UnaryPredicate p, const T& new_value) { +template +sycl::event replace_if(sycl::queue &q, ForwardIt first, ForwardIt last, + UnaryPredicate p, const T &new_value, + const std::vector &deps = {}) { if(first == last) return sycl::event{}; return for_each(q, first, last, [=](auto& x){ if(p(x)) x = new_value; - }); + }, deps); } -template -sycl::event 
replace_copy_if( - sycl::queue& q, ForwardIt1 first, - ForwardIt1 last, ForwardIt2 d_first, UnaryPredicate p, const T &new_value) { +template +sycl::event replace_copy_if(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, + ForwardIt2 d_first, UnaryPredicate p, + const T &new_value, + const std::vector &deps = {}) { if (first == last) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, + return q.parallel_for(sycl::range{std::distance(first, last)}, deps, [=](sycl::id<1> id) { auto input = first; auto output = d_first; @@ -363,27 +370,27 @@ sycl::event replace_copy_if( } template -sycl::event -replace_copy(sycl::queue& q, - ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, - const T &old_value, const T &new_value) { +sycl::event replace_copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, + ForwardIt2 d_first, const T &old_value, + const T &new_value, + const std::vector &deps = {}) { if (first == last) return sycl::event{}; return replace_copy_if( q, first, last, d_first, [=](const auto &x) { return x == old_value; }, - new_value); + new_value, deps); } // Need transform_reduce functionality for find etc, so forward // declare here. -template sycl::event transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt first, ForwardIt last, T* out, T init, - BinaryReductionOp reduce, UnaryTransformOp transform); + BinaryReductionOp reduce, UnaryTransformOp transform, + const std::vector &deps); -/* // Need transform_reduce functionality for find etc, so forward // declare here. 
template sycl::event early_exit_for_each(sycl::queue &q, std::size_t problem_size, early_exit_flag_t *output_has_exited_early, - Predicate should_exit) { + Predicate should_exit, + const std::vector &deps = {}) { std::size_t group_size = 128; @@ -451,7 +459,7 @@ sycl::event early_exit_for_each(sycl::queue &q, std::size_t problem_size, }); }; - auto evt = q.single_task([=](){*output_has_exited_early = false;}); + auto evt = q.single_task(deps, [=](){*output_has_exited_early = false;}); return q.parallel_for(sycl::nd_range<1>{dispatched_global_size, group_size}, evt, kernel); } @@ -461,7 +469,7 @@ sycl::event early_exit_for_each(sycl::queue &q, std::size_t problem_size, template sycl::event all_of(sycl::queue &q, ForwardIt first, ForwardIt last, detail::early_exit_flag_t* out, - UnaryPredicate p) { + UnaryPredicate p, const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; @@ -470,7 +478,7 @@ sycl::event all_of(sycl::queue &q, auto it = first; std::advance(it, idx[0]); return !p(*it); - }); + }, deps); return q.single_task(evt, [=](){ *out = static_cast(!(*out)); }); @@ -479,7 +487,7 @@ sycl::event all_of(sycl::queue &q, template sycl::event any_of(sycl::queue &q, ForwardIt first, ForwardIt last, detail::early_exit_flag_t* out, - UnaryPredicate p) { + UnaryPredicate p, const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; @@ -488,18 +496,18 @@ sycl::event any_of(sycl::queue &q, auto it = first; std::advance(it, idx[0]); return p(*it); - }); + }, deps); } template sycl::event none_of(sycl::queue &q, ForwardIt first, ForwardIt last, detail::early_exit_flag_t* out, - UnaryPredicate p) { + UnaryPredicate p, const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; - auto evt = any_of(q, first, last, out, p); + auto evt = any_of(q, first, last, out, 
p, deps); return q.single_task(evt, [=](){ *out = static_cast(!(*out)); }); @@ -507,12 +515,13 @@ sycl::event none_of(sycl::queue &q, template sycl::event sort(sycl::queue &q, RandomIt first, RandomIt last, - Compare comp = std::less<>{}) { + Compare comp = std::less<>{}, + const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; - - return sorting::bitonic_sort(q, first, last, comp); + + return sorting::bitonic_sort(q, first, last, comp, deps); } template< class ForwardIt1, class ForwardIt2, @@ -538,10 +547,11 @@ sycl::event merge(sycl::queue& q, if (q.get_device().get_backend() == sycl::backend::omp) return merging::segmented_merge(q, first1, last1, first2, last2, d_first, - comp); + comp, 128, deps); else - return merging::hierarchical_hybrid_merge( - q, scratch_allocations, first1, last1, first2, last2, d_first, comp); + return merging::hierarchical_hybrid_merge(q, scratch_allocations, first1, + last1, first2, last2, d_first, + comp, 128, deps); } } diff --git a/include/hipSYCL/algorithms/numeric.hpp b/include/hipSYCL/algorithms/numeric.hpp index e37aa240e..11ecbe9f2 100644 --- a/include/hipSYCL/algorithms/numeric.hpp +++ b/include/hipSYCL/algorithms/numeric.hpp @@ -69,7 +69,8 @@ sycl::event wg_model_reduction(sycl::queue &q, util::allocation_group &scratch_allocations, T *output, T init, std::size_t target_num_groups, std::size_t local_size, std::size_t problem_size, - Kernel k, BinaryReductionOp op) { + Kernel k, BinaryReductionOp op, + const std::vector& deps = {}) { assert(target_num_groups > 0); sycl::event last_event; @@ -121,6 +122,7 @@ sycl::event wg_model_reduction(sycl::queue &q, last_event = q.submit([&](sycl::handler &cgh) { sycl::local_accessor acc{sycl::range<1>{main_kernel_local_mem}, cgh}; + cgh.depends_on(deps); cgh.parallel_for(sycl::nd_range<1>{dispatched_global_size, local_size}, main_kernel); }); @@ -135,60 +137,25 @@ template sycl::event 
wg_model_reduction(sycl::queue &q, util::allocation_group &scratch_allocations, T *output, T init, std::size_t target_num_groups, - std::size_t problem_size, Kernel k, BinaryReductionOp op) { + std::size_t problem_size, Kernel k, BinaryReductionOp op, + const std::vector& deps = {}) { return wg_model_reduction(q, scratch_allocations, output, init, - target_num_groups, 128, problem_size, k, op); + target_num_groups, 128, problem_size, k, op, deps); } -template -sycl::event threading_model_reduction(sycl::queue &q, - util::allocation_group &scratch_allocations, - T *output, T init, std::size_t n, Kernel k, - BinaryReductionOp op) { - - sycl::event last_event; - auto single_task_launcher = - [&](auto kernel) { - last_event = q.single_task(kernel); - }; - - auto operator_config = get_reduction_operator_configuration(op); - auto reduction_descriptor = reduction::reduction_descriptor{ - operator_config, init, output}; - - reduction::threading_model::omp_thread_info_query thread_info_query; - reduction::threading_reduction_engine engine{thread_info_query, - &scratch_allocations}; - auto plan = engine.create_plan(n, reduction_descriptor); - auto main_kernel = engine.make_main_reducing_kernel(k, plan); - - last_event = q.submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{n}, - main_kernel); - }); - - engine.run_additional_kernels(single_task_launcher, plan); - - return last_event; -} template sycl::event transform_reduce_impl(sycl::queue &q, util::allocation_group &scratch_allocations, T *output, T init, std::size_t n, Kernel k, - BinaryReductionOp op) { - if(q.get_device().is_host()) { -#ifdef HIPSYCL_ALGORITHMS_TRANSFORM_REDUCE_HOST_THREADING_MODEL - return threading_model_reduction(q, scratch_allocations, output, init, n, k, - op); -#endif - } + BinaryReductionOp op, + const std::vector& deps) { sycl::device dev = q.get_device(); std::size_t num_groups = dev.get_info() * 4; return wg_model_reduction(q, scratch_allocations, output, init, num_groups, - n, 
k, op); + n, k, op, deps); } @@ -205,7 +172,8 @@ sycl::event transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, T *out, T init, BinaryReductionOp reduce, - BinaryTransformOp transform) { + BinaryTransformOp transform, + const std::vector& deps = {}) { if(first1 == last1) return sycl::event{}; @@ -219,7 +187,7 @@ transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, }; return detail::transform_reduce_impl(q, scratch_allocations, out, init, n, - kernel, reduce); + kernel, reduce, deps); } template & deps = {}) { if(first == last) return sycl::event{}; @@ -239,39 +208,43 @@ transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, }; return detail::transform_reduce_impl(q, scratch_allocations, out, init, n, - kernel, reduce); + kernel, reduce, deps); } template sycl::event transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt1 first1, ForwardIt1 last1, - ForwardIt2 first2, T *out, T init) { + ForwardIt2 first2, T *out, T init, + const std::vector& deps = {}) { return transform_reduce(q, scratch_allocations, first1, last1, first2, out, - init, std::plus{}, std::multiplies{}); + init, std::plus{}, std::multiplies{}, deps); } template sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt first, ForwardIt last, T *out, T init, - BinaryOp binary_op) { + BinaryOp binary_op, + const std::vector& deps = {}) { return transform_reduce(q, scratch_allocations, first, last, out, init, - binary_op, [](auto x) { return x; }); + binary_op, [](auto x) { return x; }, deps); } template sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, - ForwardIt first, ForwardIt last, T *out, T init) { - return reduce(q, scratch_allocations, first, last, out, init, std::plus{}); + ForwardIt first, ForwardIt last, T *out, T init, + const std::vector& deps = {}) { + return 
reduce(q, scratch_allocations, first, last, out, init, std::plus{}, deps); } template sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt first, ForwardIt last, - typename std::iterator_traits::value_type *out) { + typename std::iterator_traits::value_type *out, + const std::vector& deps = {}) { return reduce(q, scratch_allocations, first, last, out, - typename std::iterator_traits::value_type{}); + typename std::iterator_traits::value_type{}, deps); } ///////////////////////////// scans ///////////////////////////////////// diff --git a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp index ed0b838a2..f2518438d 100644 --- a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp +++ b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp @@ -67,7 +67,7 @@ void bitonic_group_sort(RandomIt first, SizeT group_size, SizeT problem_size, template sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, - Comparator comp) { + Comparator comp, const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); sycl::event most_recent_event; @@ -88,7 +88,7 @@ sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, } }; if(is_first_kernel || q.is_in_order()) - most_recent_event = q.parallel_for(problem_size, k); + most_recent_event = q.parallel_for(problem_size, deps, k); else most_recent_event = q.parallel_for(problem_size, most_recent_event, k); is_first_kernel = false; From 2398ebbc3b07df8deb7d958b2e569468f2301d1f Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 10 Dec 2024 07:17:54 +0100 Subject: [PATCH 096/126] Add macro to disable short namespace --- doc/macros.md | 1 + include/hipSYCL/sycl/detail/namespace_compat.hpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/doc/macros.md b/doc/macros.md index 481ad60ad..3351002b1 100644 --- a/doc/macros.md +++ b/doc/macros.md @@ -68,3 +68,4 @@ Note: Some compiler drivers that AdaptiveCpp 
supports can compile for multiple b * `ACPP_STRICT_ACCESSOR_DEDUCTION` - define when building your SYCL implementation to enforce strict SYCL 2020 accessor type deduction rules. While this might be required for the correct compilation of certain SYCL code, it also disables parts of the AdaptiveCpp accessor variants performance optimization extension. As such, it can have a negative performance impact for code bound by register pressure. * `ACPP_ALLOW_INSTANT_SUBMISSION` - define to `1` before including `sycl.hpp` to allow submission of USM operations to in-order queues via the low-latency instant submission mechanism. Set to `0` to prevent the runtime from utilizing the instant submission mechanism. If C++ standard parallelism offloading is enabled, instant submissions are always allowed. * `ACPP_FORCE_INSTANT_SUBMISSION` - define to `1` before including `sycl.hpp` to imply `ACPP_ALLOW_INSTANT_SUBMISSION=1` and throw an exception when instant submission is not possible. +* `ACPP_NO_SHORT_NAMESPACE` - if defined, disables exposing AdaptiveCpp functionality in the `acpp` namespace. 
\ No newline at end of file diff --git a/include/hipSYCL/sycl/detail/namespace_compat.hpp b/include/hipSYCL/sycl/detail/namespace_compat.hpp index 3ff1263ee..9d9b672c5 100644 --- a/include/hipSYCL/sycl/detail/namespace_compat.hpp +++ b/include/hipSYCL/sycl/detail/namespace_compat.hpp @@ -12,8 +12,14 @@ #ifndef ACPP_NAMESPACE_COMPAT #define ACPP_NAMESPACE_COMPAT +#ifndef ACPP_NO_SHORT_NAMESPACE namespace acpp { using namespace hipsycl; } +#endif + +namespace adaptivecpp { + using namespace hipsycl; +} #endif From 07f0aa26426de33d1e97bd22fef592f663c3216f Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 10 Dec 2024 18:57:49 +0100 Subject: [PATCH 097/126] [SSCP][llvm-to-host] Support internal local memory group algorithms --- include/hipSYCL/compiler/cbs/IRUtils.hpp | 1 + .../hipSYCL/runtime/omp/omp_code_object.hpp | 7 +++- .../host/HostKernelWrapperPass.cpp | 9 +++- src/libkernel/sscp/host/localmem.cpp | 9 ++++ src/runtime/omp/omp_queue.cpp | 41 ++++++++++++++----- 5 files changed, 53 insertions(+), 14 deletions(-) diff --git a/include/hipSYCL/compiler/cbs/IRUtils.hpp b/include/hipSYCL/compiler/cbs/IRUtils.hpp index 7fffdef22..2334e0a78 100644 --- a/include/hipSYCL/compiler/cbs/IRUtils.hpp +++ b/include/hipSYCL/compiler/cbs/IRUtils.hpp @@ -59,6 +59,7 @@ static const std::array NumGroupsGlobalNames{ NumGroupsGlobalNameX, NumGroupsGlobalNameY, NumGroupsGlobalNameZ}; static constexpr const char SscpDynamicLocalMemoryPtrName[] = "__acpp_cbs_sscp_dynamic_local_memory"; +static constexpr const char SscpInternalLocalMemoryPtrName[] = "__acpp_cbs_sscp_internal_local_memory"; } // namespace cbs static constexpr const char SscpAnnotationsName[] = "hipsycl.sscp.annotations"; diff --git a/include/hipSYCL/runtime/omp/omp_code_object.hpp b/include/hipSYCL/runtime/omp/omp_code_object.hpp index 0a4f655d0..576495777 100644 --- a/include/hipSYCL/runtime/omp/omp_code_object.hpp +++ b/include/hipSYCL/runtime/omp/omp_code_object.hpp @@ -29,14 +29,17 @@ class 
omp_sscp_executable_object : public code_object { // The kernel argument struct providing work-group information. struct work_group_info { work_group_info(rt::range<3> num_groups, rt::id<3> group_id, - rt::range<3> local_size, void* local_memory) + rt::range<3> local_size, void *local_memory, + void *internal_local_memory) : _num_groups(num_groups), _group_id(group_id), _local_size(local_size), - _local_memory(local_memory) {} + _local_memory(local_memory), + _internal_local_memory(internal_local_memory) {} rt::range<3> _num_groups; rt::range<3> _group_id; rt::range<3> _local_size; void* _local_memory; + void* _internal_local_memory; }; using omp_sscp_kernel = void(const work_group_info *, void **); diff --git a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp index 932702208..7be3ff02d 100644 --- a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp +++ b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp @@ -85,7 +85,8 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal llvm::StructType::get(llvm::ArrayType::get(SizeT, 3), // # groups llvm::ArrayType::get(SizeT, 3), // group id llvm::ArrayType::get(SizeT, 3), // local size - llvm::PointerType::getUnqual(Bld.getInt8Ty())); // local memory size + llvm::PointerType::getUnqual(Bld.getInt8Ty()), // local memory ptr + llvm::PointerType::getUnqual(Bld.getInt8Ty())); // internal local memory ptr auto VoidPtrT = llvm::PointerType::getUnqual(Bld.getInt8Ty()); auto UserArgsT = llvm::PointerType::getUnqual(VoidPtrT); @@ -135,6 +136,11 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal llvm::LLVMContext::MD_dereferenceable, llvm::MDNode::get(Ctx, {llvm::ConstantAsMetadata::get(Bld.getInt64(DynamicLocalMemSize))})); + auto InternalLocalMemPtr = Bld.CreateLoad( + VoidPtrT, + Bld.CreateInBoundsGEP(WorkGroupInfoT, Wrapper->getArg(0), {Bld.getInt64(0), Bld.getInt32(4)}), + 
"internal_local_mem_ptr"); + llvm::SmallVector Args; auto ArgArray = Wrapper->arg_begin() + 1; @@ -168,6 +174,7 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I], LocalSize[I]); } replaceUsesOfGVWith(*Wrapper, cbs::SscpDynamicLocalMemoryPtrName, LocalMemPtr); + replaceUsesOfGVWith(*Wrapper, cbs::SscpInternalLocalMemoryPtrName, InternalLocalMemPtr); F.setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage); F.replaceAllUsesWith(Wrapper); diff --git a/src/libkernel/sscp/host/localmem.cpp b/src/libkernel/sscp/host/localmem.cpp index 7edd7022f..f136b54be 100644 --- a/src/libkernel/sscp/host/localmem.cpp +++ b/src/libkernel/sscp/host/localmem.cpp @@ -11,7 +11,9 @@ #include "hipSYCL/sycl/libkernel/sscp/builtins/localmem.hpp" extern "C" void* __acpp_cbs_sscp_dynamic_local_memory; +extern "C" void* __acpp_cbs_sscp_internal_local_memory; +HIPSYCL_SSCP_BUILTIN __attribute__((address_space(3))) void* __acpp_sscp_get_dynamic_local_memory() { // We rely on the host side allocating page-aligned memory. 
On all relevant @@ -20,3 +22,10 @@ __attribute__((address_space(3))) void* __acpp_sscp_get_dynamic_local_memory() { return (__attribute__((address_space(3))) void *)(__builtin_assume_aligned( __acpp_cbs_sscp_dynamic_local_memory, 512)); } + + +HIPSYCL_SSCP_BUILTIN +void* __acpp_sscp_host_get_internal_local_memory() { + return (void *)(__builtin_assume_aligned( + __acpp_cbs_sscp_internal_local_memory, 512)); +} \ No newline at end of file diff --git a/src/runtime/omp/omp_queue.cpp b/src/runtime/omp/omp_queue.cpp index e5637f32b..9a789e4c6 100644 --- a/src/runtime/omp/omp_queue.cpp +++ b/src/runtime/omp/omp_queue.cpp @@ -186,14 +186,36 @@ std::size_t get_page_size() { #endif } +void *resize_and_align(std::vector &data, std::size_t size, + std::size_t alignment) { + data.resize(size + alignment); + return reinterpret_cast( + next_multiple_of(reinterpret_cast(data.data()), + alignment)); +} + +void *resize_and_strongly_align(std::vector &data, std::size_t size) { + // compiler/libkernel builtins assume alignment of at least + // 512 byte boundaries + std::size_t alignment = std::max(std::size_t{512}, get_page_size()); + return resize_and_align(data, size, alignment); +} + result launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, const rt::range<3> &num_groups, const rt::range<3> &local_size, unsigned shared_memory, void **kernel_args) { if (num_groups.size() == 1 && shared_memory == 0) { + // still need to be able to support group algorithms + // make thread-local in case we have multiple threads submitting. 
+ static thread_local std::vector internal_local_memory; + auto aligned_internal_local_memory = resize_and_strongly_align( + internal_local_memory, local_size.size() * sizeof(uint64_t)); + omp_sscp_executable_object::work_group_info info{ - num_groups, rt::id<3>{0, 0, 0}, local_size, nullptr}; + num_groups, rt::id<3>{0, 0, 0}, local_size, nullptr, + aligned_internal_local_memory}; kernel(&info, kernel_args); return make_success(); } @@ -210,15 +232,11 @@ launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, { // get page aligned local memory from heap static thread_local std::vector local_memory; - - // compiler/libkernel builtins assume that local mem is aligned to at least - // 512 byte boundaries - const auto local_mem_alignment = std::max(std::size_t{512}, get_page_size()); - local_memory.resize(shared_memory + local_mem_alignment); - auto aligned_local_memory = reinterpret_cast( - next_multiple_of(reinterpret_cast(local_memory.data()), - local_mem_alignment)); - + static thread_local std::vector internal_local_memory; + auto aligned_local_memory = + resize_and_strongly_align(local_memory, shared_memory); + auto aligned_internal_local_memory = resize_and_strongly_align( + internal_local_memory, local_size.size() * sizeof(uint64_t)); #ifdef _OPENMP #pragma omp for collapse(3) #endif @@ -226,7 +244,8 @@ launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, for (std::size_t j = 0; j < num_groups.get(1); ++j) { for (std::size_t i = 0; i < num_groups.get(0); ++i) { omp_sscp_executable_object::work_group_info info{ - num_groups, rt::id<3>{i, j, k}, local_size, aligned_local_memory}; + num_groups, rt::id<3>{i, j, k}, local_size, aligned_local_memory, + aligned_internal_local_memory}; kernel(&info, kernel_args); } } From f9a2e44e1feff9a2be1e5fd818ff964d0169931d Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 11 Dec 2024 02:45:23 +0100 Subject: [PATCH 098/126] Also install headers in AdaptiveCpp/ directory --- 
CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1b4e4a10..24576e398 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -613,11 +613,19 @@ configure_file( install(DIRECTORY include/CL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.hpp") install(DIRECTORY include/SYCL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.hpp") + install(DIRECTORY include/hipSYCL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.hpp") install(DIRECTORY include/hipSYCL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.h") install(DIRECTORY include/hipSYCL/std DESTINATION include/AdaptiveCpp/hipSYCL/ ) install(FILES ${PROJECT_BINARY_DIR}/include/hipSYCL/common/config.hpp DESTINATION include/AdaptiveCpp/hipSYCL/common/) +# This part of the installation process can be simplified once the source directory has been +# renamed from hipSYCL to AdaptiveCpp. +install(DIRECTORY include/hipSYCL/ DESTINATION include/AdaptiveCpp/AdaptiveCpp FILES_MATCHING PATTERN "*.hpp") +install(DIRECTORY include/hipSYCL/ DESTINATION include/AdaptiveCpp/AdaptiveCpp FILES_MATCHING PATTERN "*.h") +install(DIRECTORY include/hipSYCL/std/ DESTINATION include/AdaptiveCpp/AdaptiveCpp/std ) +install(FILES ${PROJECT_BINARY_DIR}/include/hipSYCL/common/config.hpp DESTINATION include/AdaptiveCpp/AdaptiveCpp/common/) + if(NOT WIN32) # Windows is case-insensitive, so don't copy to sycl/sycl.hpp as # we already have SYCL/sycl.hpp From 454986555958a4f3b9a77c27e5d5b076f6a903bb Mon Sep 17 00:00:00 2001 From: carbotaniuman <41451839+carbotaniuman@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:06:09 -0800 Subject: [PATCH 099/126] Add `is_device_copyable` --- include/hipSYCL/sycl/is_device_copyable.hpp | 61 +++++++++++++++++++++ include/hipSYCL/sycl/sycl.hpp | 1 + 2 files changed, 62 insertions(+) create mode 100644 include/hipSYCL/sycl/is_device_copyable.hpp diff --git a/include/hipSYCL/sycl/is_device_copyable.hpp 
b/include/hipSYCL/sycl/is_device_copyable.hpp new file mode 100644 index 000000000..29e3c5276 --- /dev/null +++ b/include/hipSYCL/sycl/is_device_copyable.hpp @@ -0,0 +1,61 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_IS_DEVICE_COPYABLE_HPP +#define HIPSYCL_IS_DEVICE_COPYABLE_HPP + +#include +#include +#include +#include +#include + +// AdaptiveCPP does not use this type trait to restrict allowed +// arguments to a kernel - this is simply provided for compatibility. + +#define SYCL_DEVICE_COPYABLE 1 + +namespace hipsycl { +namespace sycl { + +template struct is_device_copyable; + +namespace detail { +template +struct is_device_copyable_impl : std::is_trivially_copyable {}; + +template +struct is_device_copyable_impl>>> : is_device_copyable> {}; +} + +template struct is_device_copyable : detail::is_device_copyable_impl {}; + +template +inline constexpr bool is_device_copyable_v = is_device_copyable::value; + +template +struct is_device_copyable> : std::true_type {}; + +template +struct is_device_copyable> : is_device_copyable {}; + +template +struct is_device_copyable> : is_device_copyable {}; + +template +struct is_device_copyable> : std::bool_constant && is_device_copyable_v> {}; + +template +struct is_device_copyable> : std::bool_constant<(... 
&& is_device_copyable_v)> {}; + +} +} + +#endif diff --git a/include/hipSYCL/sycl/sycl.hpp b/include/hipSYCL/sycl/sycl.hpp index 3ff1e47b1..73a981fd1 100644 --- a/include/hipSYCL/sycl/sycl.hpp +++ b/include/hipSYCL/sycl/sycl.hpp @@ -64,6 +64,7 @@ #include "version.hpp" #include "types.hpp" #include "exception.hpp" +#include "is_device_copyable.hpp" #include "device_selector.hpp" #include "device.hpp" #include "platform.hpp" From 396d950d58a322ba6eaa81879ae74aabb01d1a37 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 11 Dec 2024 06:35:49 +0100 Subject: [PATCH 100/126] Bump version to 24.10 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1b4e4a10..08676c6eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ endif() set(ACPP_VERSION_MAJOR 24) -set(ACPP_VERSION_MINOR 06) +set(ACPP_VERSION_MINOR 10) set(ACPP_VERSION_PATCH 0) execute_process( From a688389b8a92ef0994d5f84722df5e0c7f6e7dfe Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 11 Dec 2024 06:28:01 +0100 Subject: [PATCH 101/126] [NFC][doc] Add user-facing documentation for algorithms library --- README.md | 1 + doc/algorithms.md | 389 ++++++++++++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 3 files changed, 391 insertions(+) create mode 100644 doc/algorithms.md diff --git a/README.md b/README.md index 4947bc70a..e7c45c43d 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ We gratefully acknowledge [contributions](https://github.com/illuhad/hipSYCL/gra * AdaptiveCpp [design and architecture](doc/architecture.md) * AdaptiveCpp runtime [specification](doc/runtime-spec.md) * AdaptiveCpp [compilation model](doc/compilation.md) +* AdaptiveCpp [parallel algorithms library](doc/algorithms.md) * How to use raw HIP/CUDA inside AdaptiveCpp code to create [optimized code paths](doc/hip-source-interop.md) * A simple SYCL example code for testing purposes can be found [here](doc/examples.md). 
* [SYCL Extensions implemented in AdaptiveCpp](doc/extensions.md) diff --git a/doc/algorithms.md b/doc/algorithms.md new file mode 100644 index 000000000..278083061 --- /dev/null +++ b/doc/algorithms.md @@ -0,0 +1,389 @@ +# AdaptiveCpp parallel algorithms library + +AdaptiveCpp ships with a library for common parallel primitives. This library is supported on all backends, with all compiler-based compilation flows. The library-only compilation flows `omp.library-only` and `cuda-nvcxx` are currently unsupported. + +The main support target is the generic JIT compiler (`--acpp-targets=generic`). + +## Example + +```c++ +#include +#include + +void run_scan(sycl::queue& q, int* device_data_ptr, int* device_output_ptr, + std::size_t problem_size) { + // Setup handling for temporary scratch memory. Note: In production work-loads, + // the allocation cache should be reused by multiple algorithm invocations for + // optimal performance. + acpp::algorithms::util::allocation_cache cache{ + acpp::algorithms::util::allocation_type::device}; + // Create a handle for the current invocation to manage its allocation requests + acpp::algorithms::util::allocation_group scratch{&cache, q.get_device()}; + // Invoke inclusive_scan + auto evt = acpp::algorithms::inclusive_scan(q, scratch, device_data_ptr, + device_data_ptr + problem_size, device_output_ptr, sycl::plus{}); + + evt.wait(); +} + +``` + +## Basic concepts + +* All algorithms are exclusively supported for the SYCL 2020 USM memory management model (either `device`, `host` or `shared` allocations). The old SYCL `buffer` model is unsupported. +* All algorithms take a `sycl::queue` to which they submit their operations. Both out-of-order and in-order queues are supported, but we recommend in-order queues for performance and since the library is better tested with in-order queues. +* All algorithms operate asynchronously, i.e. it is the user's resposibility to synchronize appropriately before results are accessed. 
+* All algorithms take an optional `const std::vector&` argument that can be used to express dependencies. +* All algorithms return a `sycl::event` which can be used for synchronization. Note: If an algorithm is invoked for a problem size of 0, then for performance reasons it immediately returns a default-constructed `sycl::event` which has a `completed` status. This is the case even if the algorithms has dependencies that are not yet complete! +* Some algorithms require temporary scratch memory. For performance reasons, this scratch memory is cached. The AdaptiveCpp algorithms library exposes control over allocation lifetime and allocation kind for this scratch memory to users (see below). +* The iterators passed into the algorithms need to be valid on the target device. + +## Allocation cache for scratch memory + + +```c++ + +namespace acpp::algorithms::util { + +/// Encodes which kind of allocations the allocation cache manages +enum class allocation_type { + device, // device USM (sycl::malloc_device()) + shared, // shared USM (sycl::malloc_shared()) + host // host USM (sycl::malloc_host()) +}; + + +/// The allocation_cache serves as an allocation pool which can serve the +/// need of algorithms. It releases its memory upon destruction or when purge() +/// is called. It is the user's responsibility to ensure that neither event +/// occurs while an algorithm using the allocation cache is still running! +/// +/// This class is thread-safe, although it might be a good idea to check +/// whether thread-local allocation caches might result in better performance. +class allocation_cache { +public: + /// Construct an allocation_cache for a specified memory type + allocation_cache(allocation_type alloc_type); + + /// When the allocation_cache is destroyed, all allocations that it manages + /// are freed. Users must ensure that the lifetime of the object extends until all operations + /// using it have completed. 
+ ~allocation_cache(); + + /// Explicitly free allocations. Users must ensure that this is not invoked before all + /// operations using it have completed. + void purge(); +}; + +/// An allocation_group represents a handle for an algorithm invocation +/// to manage its temporary scratch memory needs. +/// In typical scenarios, you will want to use one allocation_group object +/// per algorithm invocation. +/// +/// When the allocation_group is destroyed, the allocations that were requested +/// through it are returned to the parent cache, which might then use them +/// to serve other requests. +/// Therefore, users need to +/// * either ensure that the allocation_group is not destroyed before all algorithms +/// using it have completed +/// * or guarantee that allocations may be safely reassigned to other operations while +/// they are still running, e.g. because all submitted algorithms using the same +/// allocation_cache are ordered such that no race condition on the scratch allocations may +/// occur. (Imagine e.g. if all algorithms sharing one allocation_cache are submitted to a single +/// in-order queue) +/// +/// This class is not thread-safe. +class allocation_group { +public: + /// Construct allocation_group for the given cache and device. + /// + /// The user is responsible to ensure that the lifetime of the provided parent cache + /// exceeds the lifetime of this allocation_group. + /// + /// The device will be used to provide the memory allocation context; for + /// typical practical applications it will be the same device that the + /// algorithm is submitted to. + /// If the memory from this device is not accessible to the device to which + /// the algorithm is submitted, the behavior is undefined. 
+ allocation_group(allocation_cache *parent_cache, const sycl::device &dev); + + allocation_group() = default; + allocation_group(const allocation_group&) = delete; + allocation_group& operator=(const allocation_group&) = delete; + + /// Releases all managed allocations to the parent cache to be reassigned to + /// other operations. + ~allocation_group(); + + /// Explicitly releases all managed allocations to the parent cache to be reassigned to + /// other operations. + /// + /// It is the user's responsibility to ensure that this function is not called + /// before all managed allocations can be safely returned to the parent cache. + void release(); + + /// Request new allocation with the specified number of elements of type T. + /// + /// If the parent allocation cache has an allocation of sufficient size available, + /// then it will be returned and made unavailable for other allocation requests. + /// Otherwise, a new allocation will be created. + template + T* obtain(std::size_t count); +}; + + +} +``` + +## Algorithms + +The following algorithms are currently supported. Their definition aligns with their definition in the C++ STL. Please refer to the C++ reference of your choice for more information on them. + +Here, we will only describe AdaptiveCpp-specific behavior. 
+ +### Header `` + +```c++ +namespace acpp::algorithms { + + +template +sycl::event for_each(sycl::queue &q, ForwardIt first, ForwardIt last, + UnaryFunction2 f, + const std::vector &deps = {}); + +template +sycl::event for_each_n(sycl::queue &q, ForwardIt first, Size n, + UnaryFunction2 f, + const std::vector &deps = {}); + +template +sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 d_first, UnaryOperation unary_op, + const std::vector &deps = {}); + +template +sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt3 d_first, + BinaryOperation binary_op, + const std::vector &deps = {}); + +template +sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, + ForwardIt2 d_first, const std::vector &deps = {}); + +template +sycl::event copy_if(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + UnaryPredicate pred, + std::size_t *num_elements_copied = nullptr, + const std::vector &deps = {}); + +template +sycl::event copy_n(sycl::queue &q, ForwardIt1 first, Size count, + ForwardIt2 result, + const std::vector &deps = {}); + +template +sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, + const T &value, const std::vector &deps = {}); + +template +sycl::event fill_n(sycl::queue& q, + ForwardIt first, Size count, const T& value, + const std::vector &deps = {}); + +template +sycl::event generate(sycl::queue &q, ForwardIt first, ForwardIt last, + Generator g, const std::vector &deps = {}); + +template +sycl::event generate_n(sycl::queue &q, ForwardIt first, Size count, Generator g, + const std::vector &deps = {}); + +template +sycl::event replace(sycl::queue &q, ForwardIt first, ForwardIt last, + const T &old_value, const T &new_value, + const std::vector &deps = {}); + +template +sycl::event replace_if(sycl::queue &q, ForwardIt first, ForwardIt last, + UnaryPredicate p, const T &new_value, + 
const std::vector &deps = {}); + +template +sycl::event replace_copy_if(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, + ForwardIt2 d_first, UnaryPredicate p, + const T &new_value, + const std::vector &deps = {}); + +template +sycl::event replace_copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, + ForwardIt2 d_first, const T &old_value, + const T &new_value, + const std::vector &deps = {}); + +/// The result of the operation will be stored in out. +/// +/// out must point to device-accessible memory, and will be set to 0 +/// for a negative result, and 1 for a positive result. +template +sycl::event all_of(sycl::queue &q, + ForwardIt first, ForwardIt last, int* out, + UnaryPredicate p, const std::vector& deps = {}); + +/// The result of the operation will be stored in out. +/// +/// out must point to device-accessible memory, and will be set to 0 +/// for a negative result, and 1 for a positive result. +template +sycl::event any_of(sycl::queue &q, + ForwardIt first, ForwardIt last, int* out, + UnaryPredicate p, const std::vector& deps = {}); + +/// The result of the operation will be stored in out. +/// +/// out must point to device-accessible memory, and will be set to 0 +/// for a negative result, and 1 for a positive result. 
+template +sycl::event none_of(sycl::queue &q, + ForwardIt first, ForwardIt last, int* out, + UnaryPredicate p, const std::vector& deps = {}); + +template +sycl::event sort(sycl::queue &q, RandomIt first, RandomIt last, + Compare comp = std::less<>{}, + const std::vector& deps = {}); + +template< class ForwardIt1, class ForwardIt2, + class ForwardIt3, class Compare > +sycl::event merge(sycl::queue& q, + util::allocation_group &scratch_allocations, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first, Compare comp = std::less<>{}, + const std::vector& deps = {}); + +} + + +``` + +### Header `` + +```c++ +namespace acpp::algorithms { + +/// The result of the reduction will be written to out. +/// +/// out must point to memory that is accessible on the target device. +template +sycl::event +transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, T *out, + T init, BinaryReductionOp reduce, + BinaryTransformOp transform, + const std::vector& deps = {}); + +/// The result of the reduction will be written to out. +/// +/// out must point to memory that is accessible on the target device. +template +sycl::event +transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt first, ForwardIt last, T* out, T init, + BinaryReductionOp reduce, UnaryTransformOp transform, + const std::vector& deps = {}); + +/// The result of the reduction will be written to out. +/// +/// out must point to memory that is accessible on the target device. +template +sycl::event transform_reduce(sycl::queue &q, + util::allocation_group &scratch_allocations, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, T *out, T init, + const std::vector& deps = {}); + + +/// The result of the reduction will be written to out. +/// +/// out must point to memory that is accessible on the target device. 
+template +sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt first, ForwardIt last, T *out, T init, + BinaryOp binary_op, + const std::vector& deps = {}); + +/// The result of the reduction will be written to out. +/// +/// out must point to memory that is accessible on the target device. +template +sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt first, ForwardIt last, T *out, T init, + const std::vector& deps = {}); + +/// The result of the reduction will be written to out. +/// +/// out must point to memory that is accessible on the target device. +template +sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt first, ForwardIt last, + typename std::iterator_traits::value_type *out, + const std::vector& deps = {}); + + +template +sycl::event +inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + const std::vector &deps = {}); + +template +sycl::event +inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + T init, const std::vector &deps = {}); + +template +sycl::event inclusive_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + const std::vector &deps = {}); + +template +sycl::event +exclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, T init, + BinaryOp op, const std::vector &deps = {}); + +template +sycl::event exclusive_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + T init, const std::vector &deps = {}); + +template +sycl::event transform_inclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt 
d_first, BinaryOp binary_op, UnaryOp unary_op, + const std::vector &deps = {}); + +template +sycl::event transform_inclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, BinaryOp binary_op, UnaryOp unary_op, + T init, const std::vector &deps = {}); + +template +sycl::event transform_exclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, T init, BinaryOp binary_op, + UnaryOp unary_op, const std::vector &deps = {}); + + +} +``` \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index fbfc2b3b6..2e2001cc5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,6 +17,7 @@ nav: - 'Macros' : 'macros.md' - 'SYCL interoperability' : 'hip-source-interop.md' - 'C++ standard parallelism offloading (stdpar)' : 'stdpar.md' + - 'AdaptiveCpp parallel algorithms library' : 'algorithms.md' - 'AdaptiveCpp design' : - 'Architecture' : 'architecture.md' From 5882a1b422a0388b97011d0b958611b2f04f0f7c Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Wed, 11 Dec 2024 21:50:46 +0100 Subject: [PATCH 102/126] Rename ACPP_ENABLE_ALLOCATION_TRACKING to ACPP_ALLOCATION_TRACKING --- doc/env_variables.md | 2 +- include/hipSYCL/runtime/settings.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/env_variables.md b/doc/env_variables.md index 133e3c2db..924019591 100644 --- a/doc/env_variables.md +++ b/doc/env_variables.md @@ -32,7 +32,7 @@ * `ACPP_JITOPT_IADS_RELATIVE_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): When the same argument has been passed into the kernel for this fraction of all invocations of the kernel, a new kernel will be JIT-compiled with the argument value hard-wired as constant. Not taken into account for the first application run. Default: 0.8. 
* `ACPP_JITOPT_IADS_RELATIVE_THRESHOLD_MIN_DATA`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): Only consider kernels with at least this many invocations for the relative threshold described above. Default: 1024. * `ACPP_JITOPT_IADS_RELATIVE_EVICTION_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): If the relative frequency of a kernel argument value falls below this threshold, the statistics entry for the argument value may be evicted if space for other values is needed. -* `ACPP_ENABLE_ALLOCATION_TRACKING`: If set to 1, allows the AdaptiveCpp runtime to track and register the allocations that it manages. This enables additional JIT-time optimizations. Set to 0 to disable. +* `ACPP_ALLOCATION_TRACKING`: If set to 1, allows the AdaptiveCpp runtime to track and register the allocations that it manages. This enables additional JIT-time optimizations. Set to 0 to disable. 
(Default: 0) ## Environment variables to control dumping IR during JIT compilation diff --git a/include/hipSYCL/runtime/settings.hpp b/include/hipSYCL/runtime/settings.hpp index 82e7a2ad9..59b0781a6 100644 --- a/include/hipSYCL/runtime/settings.hpp +++ b/include/hipSYCL/runtime/settings.hpp @@ -147,7 +147,7 @@ HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::jitopt_iads_relative_eviction_threshold, HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::jitopt_iads_relative_threshold_min_data, "jitopt_iads_relative_threshold_min_data", std::size_t) -HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::enable_allocation_tracking, "enable_allocation_tracking", bool) +HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::enable_allocation_tracking, "allocation_tracking", bool) class settings { From f05f21da6d1b0992ee87c50c127e034b1ba4c407 Mon Sep 17 00:00:00 2001 From: Moritz Date: Sun, 8 Dec 2024 19:45:49 +0100 Subject: [PATCH 103/126] Add support for replacing the work-group global variables inside kernels (CPU implementation) when using SSCP. 
--- include/hipSYCL/compiler/cbs/IRUtils.hpp | 3 + .../host/HostKnownWgSizePass.hpp | 38 +++++++++++++ src/compiler/cbs/IRUtils.cpp | 18 ++++++ src/compiler/llvm-to-backend/CMakeLists.txt | 2 +- .../host/HostKernelWrapperPass.cpp | 22 ++----- .../host/HostKnownWgSizePass.cpp | 57 +++++++++++++++++++ .../llvm-to-backend/host/LLVMToHost.cpp | 2 + 7 files changed, 124 insertions(+), 18 deletions(-) create mode 100644 include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp create mode 100644 src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp diff --git a/include/hipSYCL/compiler/cbs/IRUtils.hpp b/include/hipSYCL/compiler/cbs/IRUtils.hpp index 7fffdef22..b39721b4d 100644 --- a/include/hipSYCL/compiler/cbs/IRUtils.hpp +++ b/include/hipSYCL/compiler/cbs/IRUtils.hpp @@ -79,6 +79,9 @@ template struct PtrSetWrapper { auto begin() -> decltype(Set.begin()) { return Set.begin(); } }; + +void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm::Value *To, llvm::StringRef LogPrefix = ""); + llvm::Loop *updateDtAndLi(llvm::LoopInfo &LI, llvm::DominatorTree &DT, const llvm::BasicBlock *B, llvm::Function &F); diff --git a/include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp b/include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp new file mode 100644 index 000000000..64369b4a1 --- /dev/null +++ b/include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp @@ -0,0 +1,38 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_HOST_KNOWN_WG_SIZE_HPP +#define HIPSYCL_HOST_KNOWN_WG_SIZE_HPP + +#include + +namespace hipsycl { +namespace compiler { + +/** + * SubCfgFormationPass internally uses the work-group size global variables. + * For example, we use them for loop trip counts. + * Since we know their value at run-time, we just replace all uses of the global variables with + * their respective constant value. + */ +class HostKnownWgSizePass : public llvm::PassInfoMixin { + std::array KnownWgSize; + +public: + explicit HostKnownWgSizePass(int KnownGroupSizeX, int KnownGroupSizeY, int KnownGroupSizeZ) + : KnownWgSize{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ} {} + + llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); +}; + +} // namespace compiler +} // namespace hipsycl + +#endif diff --git a/src/compiler/cbs/IRUtils.cpp b/src/compiler/cbs/IRUtils.cpp index 7b598e5b2..64c022856 100644 --- a/src/compiler/cbs/IRUtils.cpp +++ b/src/compiler/cbs/IRUtils.cpp @@ -27,6 +27,24 @@ namespace hipsycl::compiler::utils { using namespace hipsycl::compiler::cbs; +void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm::Value *To, llvm::StringRef LogPrefix) { + auto M = F.getParent(); + auto GV = M->getGlobalVariable(GlobalVarName); + if (!GV) + return; + + HIPSYCL_DEBUG_INFO << LogPrefix << "RUOGVW: " << *GV << " with " << *To << "\n"; + llvm::SmallVector ToErase; + for (auto U : GV->users()) { + if (auto I = llvm::dyn_cast(U); I && I->getFunction() == &F) { + HIPSYCL_DEBUG_INFO << LogPrefix << "RUOGVW: " << *I << " with " << *To << "\n"; + I->replaceAllUsesWith(To); + } + } + for (auto I : ToErase) + I->eraseFromParent(); +} + llvm::Loop *updateDtAndLi(llvm::LoopInfo &LI, llvm::DominatorTree &DT, const llvm::BasicBlock *B, llvm::Function &F) { DT.reset(); diff --git a/src/compiler/llvm-to-backend/CMakeLists.txt b/src/compiler/llvm-to-backend/CMakeLists.txt index 
741c2ef55..68deb7695 100644 --- a/src/compiler/llvm-to-backend/CMakeLists.txt +++ b/src/compiler/llvm-to-backend/CMakeLists.txt @@ -220,7 +220,7 @@ if(WITH_SSCP_COMPILER) add_hipsycl_llvm_backend( BACKEND host - LIBRARY host/LLVMToHost.cpp host/HostKernelWrapperPass.cpp + LIBRARY host/LLVMToHost.cpp host/HostKernelWrapperPass.cpp host/HostKnownWgSizePass.cpp TOOL host/LLVMToHostTool.cpp) target_compile_definitions(llvm-to-host PRIVATE diff --git a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp index 932702208..abb655dc1 100644 --- a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp +++ b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp @@ -37,6 +37,9 @@ namespace hipsycl { namespace compiler { namespace { + +constexpr llvm::StringRef PassPrefix = "[SSCP][HostKernelWrapper] "; + llvm::StoreInst *storeToGlobalVar(llvm::IRBuilderBase Bld, llvm::Value *V, llvm::StringRef GlobalVarName) { auto M = Bld.GetInsertBlock()->getModule(); @@ -46,21 +49,7 @@ llvm::StoreInst *storeToGlobalVar(llvm::IRBuilderBase Bld, llvm::Value *V, } void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm::Value *To) { - auto M = F.getParent(); - auto GV = M->getGlobalVariable(GlobalVarName); - if (!GV) - return; - - HIPSYCL_DEBUG_INFO << "[SSCP][HostKernelWrapper] RUOGVW: " << *GV << " with " << *To << "\n"; - llvm::SmallVector ToErase; - for (auto U : GV->users()) { - if (auto I = llvm::dyn_cast(U); I && I->getFunction() == &F) { - HIPSYCL_DEBUG_INFO << "[SSCP][HostKernelWrapper] RUOGVW: " << *I << " with " << *To << "\n"; - I->replaceAllUsesWith(To); - } - } - for (auto I : ToErase) - I->eraseFromParent(); + utils::replaceUsesOfGVWith(F, GlobalVarName, To, PassPrefix); } /* @@ -188,8 +177,7 @@ llvm::PreservedAnalyses HostKernelWrapperPass::run(llvm::Function &F, auto Wrapper = makeWrapperFunction(F, DynamicLocalMemSize); - HIPSYCL_DEBUG_INFO << "[SSCP][HostKernelWrapper] 
Created kernel wrapper: " << Wrapper->getName() - << "\n"; + HIPSYCL_DEBUG_INFO << PassPrefix << "Created kernel wrapper: " << Wrapper->getName() << "\n"; return llvm::PreservedAnalyses::none(); } diff --git a/src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp b/src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp new file mode 100644 index 000000000..27c67e32a --- /dev/null +++ b/src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp @@ -0,0 +1,57 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#include "hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp" +#include "hipSYCL/compiler/cbs/IRUtils.hpp" +#include "hipSYCL/compiler/cbs/SplitterAnnotationAnalysis.hpp" +#include +#include +#include +#include +#include +#include + +namespace hipsycl { +namespace compiler { + +namespace { + +constexpr llvm::StringRef PassPrefix = "[SSCP][HostWgSizeOpt] "; + +void replaceWgSizeGlobalsWithConstants(llvm::Function &F, const std::array &KnownWGSize) { + auto DL = F.getParent()->getDataLayout(); + auto SizeT = DL.getLargestLegalIntType(F.getContext()); + + for (auto i = 0ul; i < 3ul; ++i) { + if (KnownWGSize.at(i) != 0) + utils::replaceUsesOfGVWith(F, cbs::LocalSizeGlobalNames.at(i), + llvm::ConstantInt::get(SizeT, KnownWGSize.at(i)), PassPrefix); + } +} + +} // namespace + +llvm::PreservedAnalyses HostKnownWgSizePass::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + + auto &MAM = AM.getResult(F); + auto *SAA = MAM.getCachedResult(*F.getParent()); + if (!SAA || !SAA->isKernelFunc(&F)) + return llvm::PreservedAnalyses::all(); + + replaceWgSizeGlobalsWithConstants(F, KnownWgSize); + + HIPSYCL_DEBUG_INFO << PassPrefix << "Replaced 
work-group size GVs with Constants\n"; + + return llvm::PreservedAnalyses::none(); +} + +} // namespace compiler +} // namespace hipsycl diff --git a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp index 80640df32..0d336f6ab 100644 --- a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp +++ b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp @@ -9,6 +9,7 @@ */ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/compiler/llvm-to-backend/host/LLVMToHost.hpp" +#include "hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp" #include "hipSYCL/common/debug.hpp" #include "hipSYCL/common/filesystem.hpp" @@ -101,6 +102,7 @@ bool LLVMToHostTranslator::toBackendFlavor(llvm::Module &M, PassHandler &PH) { registerCBSPipeline(MPM, hipsycl::compiler::OptLevel::O3, true); llvm::FunctionPassManager FPM; + FPM.addPass(HostKnownWgSizePass{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ}); FPM.addPass(HostKernelWrapperPass{KnownLocalMemSize}); MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); From ff7a0deda8cd007b4cc04e341e17e8649f6d10bc Mon Sep 17 00:00:00 2001 From: Moritz Date: Sun, 8 Dec 2024 19:52:06 +0100 Subject: [PATCH 104/126] Execute lit (compiler/cbs) tests also with the SSCP pass. 
--- tests/compiler/cbs/accumulator_for.cpp | 4 ++++ tests/compiler/cbs/add_modulo.cpp | 4 ++++ tests/compiler/cbs/cond_between_barriers.cpp | 4 ++++ tests/compiler/cbs/conds.cpp | 4 ++++ tests/compiler/cbs/conds_in_for.cpp | 4 ++++ tests/compiler/cbs/const_init_accumulator_for.cpp | 4 ++++ tests/compiler/cbs/for_in_cond.cpp | 4 ++++ tests/compiler/cbs/group_barrier.cpp | 4 ++++ tests/compiler/cbs/item_dependent_cond_in_for.cpp | 4 ++++ tests/compiler/cbs/item_dependent_for.cpp | 4 ++++ tests/compiler/cbs/multiple_indvars_for.cpp | 4 ++++ tests/compiler/cbs/no_barriers.cpp | 4 ++++ tests/compiler/cbs/reduce_const_for.cpp | 4 ++++ tests/compiler/cbs/reduce_do_while.cpp | 4 ++++ tests/compiler/cbs/reduce_for.cpp | 4 ++++ tests/compiler/cbs/reduce_for_inverse_barrier.cpp | 4 ++++ tests/compiler/cbs/reduce_nested_for.cpp | 4 ++++ tests/compiler/cbs/reduce_unrolled.cpp | 4 ++++ tests/compiler/cbs/reduce_while.cpp | 4 ++++ tests/compiler/cbs/reduce_while_early_update.cpp | 4 ++++ tests/compiler/cbs/right_heavy_cond.cpp | 4 ++++ tests/compiler/cbs/simple_kernel.cpp | 4 ++++ tests/compiler/cbs/stencil.cpp | 4 ++++ tests/compiler/cbs/sycl_dgemm.cpp | 4 ++++ tests/compiler/cbs/two_barrier_for.cpp | 4 ++++ 25 files changed, 100 insertions(+) diff --git a/tests/compiler/cbs/accumulator_for.cpp b/tests/compiler/cbs/accumulator_for.cpp index 1ce08366f..dad75c96c 100644 --- a/tests/compiler/cbs/accumulator_for.cpp +++ b/tests/compiler/cbs/accumulator_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/add_modulo.cpp b/tests/compiler/cbs/add_modulo.cpp index 718c413f8..c8256ace3 100644 --- a/tests/compiler/cbs/add_modulo.cpp +++ 
b/tests/compiler/cbs/add_modulo.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/cond_between_barriers.cpp b/tests/compiler/cbs/cond_between_barriers.cpp index 2e1e815cf..b866fdfb8 100644 --- a/tests/compiler/cbs/cond_between_barriers.cpp +++ b/tests/compiler/cbs/cond_between_barriers.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include diff --git a/tests/compiler/cbs/conds.cpp b/tests/compiler/cbs/conds.cpp index 5515d46eb..1451671aa 100644 --- a/tests/compiler/cbs/conds.cpp +++ b/tests/compiler/cbs/conds.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/conds_in_for.cpp b/tests/compiler/cbs/conds_in_for.cpp index f9c0e27d7..1c0d6784d 100644 --- a/tests/compiler/cbs/conds_in_for.cpp +++ b/tests/compiler/cbs/conds_in_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp 
%s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/const_init_accumulator_for.cpp b/tests/compiler/cbs/const_init_accumulator_for.cpp index fade89530..833a67a1a 100644 --- a/tests/compiler/cbs/const_init_accumulator_for.cpp +++ b/tests/compiler/cbs/const_init_accumulator_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/for_in_cond.cpp b/tests/compiler/cbs/for_in_cond.cpp index 0c9b2efa1..96e51a160 100644 --- a/tests/compiler/cbs/for_in_cond.cpp +++ b/tests/compiler/cbs/for_in_cond.cpp @@ -3,6 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/group_barrier.cpp b/tests/compiler/cbs/group_barrier.cpp index edcf24107..0535c5c72 100644 --- a/tests/compiler/cbs/group_barrier.cpp +++ b/tests/compiler/cbs/group_barrier.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/item_dependent_cond_in_for.cpp b/tests/compiler/cbs/item_dependent_cond_in_for.cpp index 
d10a0c48c..6cde211be 100644 --- a/tests/compiler/cbs/item_dependent_cond_in_for.cpp +++ b/tests/compiler/cbs/item_dependent_cond_in_for.cpp @@ -3,6 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/item_dependent_for.cpp b/tests/compiler/cbs/item_dependent_for.cpp index 5a9141ffb..94a5368d6 100644 --- a/tests/compiler/cbs/item_dependent_for.cpp +++ b/tests/compiler/cbs/item_dependent_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/multiple_indvars_for.cpp b/tests/compiler/cbs/multiple_indvars_for.cpp index 51a4e8bc4..53b3296e7 100644 --- a/tests/compiler/cbs/multiple_indvars_for.cpp +++ b/tests/compiler/cbs/multiple_indvars_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/no_barriers.cpp b/tests/compiler/cbs/no_barriers.cpp index 9ee651958..e193fdcd9 100644 --- a/tests/compiler/cbs/no_barriers.cpp +++ b/tests/compiler/cbs/no_barriers.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp 
--acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include diff --git a/tests/compiler/cbs/reduce_const_for.cpp b/tests/compiler/cbs/reduce_const_for.cpp index 322bdfe08..884eccbc1 100644 --- a/tests/compiler/cbs/reduce_const_for.cpp +++ b/tests/compiler/cbs/reduce_const_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_do_while.cpp b/tests/compiler/cbs/reduce_do_while.cpp index 11eee1716..22f7aa726 100644 --- a/tests/compiler/cbs/reduce_do_while.cpp +++ b/tests/compiler/cbs/reduce_do_while.cpp @@ -3,6 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_for.cpp b/tests/compiler/cbs/reduce_for.cpp index b483c0323..32c59070e 100644 --- a/tests/compiler/cbs/reduce_for.cpp +++ b/tests/compiler/cbs/reduce_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include diff --git 
a/tests/compiler/cbs/reduce_for_inverse_barrier.cpp b/tests/compiler/cbs/reduce_for_inverse_barrier.cpp index 728880a38..345f4ee46 100644 --- a/tests/compiler/cbs/reduce_for_inverse_barrier.cpp +++ b/tests/compiler/cbs/reduce_for_inverse_barrier.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_nested_for.cpp b/tests/compiler/cbs/reduce_nested_for.cpp index 14e87a679..3cf707cbb 100644 --- a/tests/compiler/cbs/reduce_nested_for.cpp +++ b/tests/compiler/cbs/reduce_nested_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_unrolled.cpp b/tests/compiler/cbs/reduce_unrolled.cpp index 0ac46b4c5..1bd5e7fec 100644 --- a/tests/compiler/cbs/reduce_unrolled.cpp +++ b/tests/compiler/cbs/reduce_unrolled.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_while.cpp b/tests/compiler/cbs/reduce_while.cpp index b5a761ad0..cc5ca5490 100644 --- a/tests/compiler/cbs/reduce_while.cpp +++ b/tests/compiler/cbs/reduce_while.cpp @@ -2,6 
+2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_while_early_update.cpp b/tests/compiler/cbs/reduce_while_early_update.cpp index 2785be3fb..3593fdc2b 100644 --- a/tests/compiler/cbs/reduce_while_early_update.cpp +++ b/tests/compiler/cbs/reduce_while_early_update.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/right_heavy_cond.cpp b/tests/compiler/cbs/right_heavy_cond.cpp index 5d35cec19..e3603ca41 100644 --- a/tests/compiler/cbs/right_heavy_cond.cpp +++ b/tests/compiler/cbs/right_heavy_cond.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/simple_kernel.cpp b/tests/compiler/cbs/simple_kernel.cpp index a7be8d9f5..dd92e49c8 100644 --- a/tests/compiler/cbs/simple_kernel.cpp +++ b/tests/compiler/cbs/simple_kernel.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | 
FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/stencil.cpp b/tests/compiler/cbs/stencil.cpp index 064e97b19..cb1cfe12b 100644 --- a/tests/compiler/cbs/stencil.cpp +++ b/tests/compiler/cbs/stencil.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/sycl_dgemm.cpp b/tests/compiler/cbs/sycl_dgemm.cpp index 3b2d3a72b..62e105370 100644 --- a/tests/compiler/cbs/sycl_dgemm.cpp +++ b/tests/compiler/cbs/sycl_dgemm.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s // adapted from https://github.com/UoB-HPC/sycl_dgemm/blob/main/dgemm.cpp diff --git a/tests/compiler/cbs/two_barrier_for.cpp b/tests/compiler/cbs/two_barrier_for.cpp index 436580779..9b65e6bf5 100644 --- a/tests/compiler/cbs/two_barrier_for.cpp +++ b/tests/compiler/cbs/two_barrier_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O +// RUN: %t | FileCheck %s #include #include From 3ae9ec982b5c8e5a110a0a94124bedfa87aab3a8 Mon Sep 17 00:00:00 2001 From: David Feltell Date: Tue, 10 Dec 2024 
20:10:06 +0000 Subject: [PATCH 105/126] [SSCP] Ensure custom kernel uses sycl::buffer device pointers If a kernel passed to `AdaptiveCpp_enqueue_custom_operation` makes use of (i.e. captures) a `sycl::buffer`, then the captured buffer's internal pointer must point to its associated device memory, in order for the custom kernel to extract the pointer and provide to e.g. `cudaMemcpyAsync`. The transformation from host pointer to device pointer is done at kernel launch time by rewriting the bytes of the kernel's captured variables using an `initialize_embedded_pointers` function. However, this function was not being called for custom operations enqueued using `AdaptiveCpp_enqueue_custom_operation`. So decorate the custom kernel with a call to `initialize_embedded_pointers`, before calling the kernel itself. Do the decoration at enqueue time, where we have access to the lambda object (and its size), rather than trying to introspect the `std::function` at launch time, with its complex and opaque allocation behaviour. Note that there is a test for this under `tests/sycl/extensions.cpp`:`custom_enqueue`. However, this test is only conditionally run when targeting `cuda` (or `hip`), and not for the `generic` target. Hackily enabling this test for the `generic` target does successfully reproduce the problem, and prove the fix works. However, enabling the test for the `generic` target requires careful thought around symbol leakage and/or CI stress, and so is left for future work (see PR discussion). 
--- include/hipSYCL/glue/kernel_launcher_data.hpp | 3 ++- .../hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp | 12 +++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/hipSYCL/glue/kernel_launcher_data.hpp b/include/hipSYCL/glue/kernel_launcher_data.hpp index 145e0a8ad..cb747c0bb 100644 --- a/include/hipSYCL/glue/kernel_launcher_data.hpp +++ b/include/hipSYCL/glue/kernel_launcher_data.hpp @@ -31,6 +31,7 @@ class dag_node; class kernel_configuration; class backend_kernel_launch_capabilities; class hcf_kernel_info; +class kernel_operation; } namespace glue { @@ -53,7 +54,7 @@ struct kernel_launcher_data { rt::range<3> group_size; // <- indices must be flipped unsigned local_mem_size; // In case the launch is a custom operation - std::function custom_op; + std::function custom_op; using invoker_function_t = rt::result (*)( const kernel_launcher_data &launch_config, rt::dag_node *node, diff --git a/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp b/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp index 4ec4ef796..fb202ee83 100644 --- a/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp +++ b/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp @@ -295,7 +295,12 @@ class sscp_kernel_launcher } else if constexpr (type == rt::kernel_type::custom) { // handled at invoke time - data.custom_op = k; + data.custom_op = [k](rt::kernel_operation * kernel_op, sycl::interop_handle& ih) mutable { + kernel_op->initialize_embedded_pointers( + static_cast(&k), + sizeof(Kernel)); + k(ih); + }; } else { assert(false && "Unsupported kernel type"); @@ -313,8 +318,9 @@ class sscp_kernel_launcher assert(backend_params); sycl::interop_handle handle{node->get_assigned_device(), backend_params}; - - launch_config.custom_op(handle); + auto *kernel_op = + static_cast(node->get_operation()); + launch_config.custom_op(kernel_op, handle); return rt::make_success(); } else { From e9de070a6179bfdd4583fd136be8a94009bff5c8 Mon Sep 17 00:00:00 2001 
From: Aksel Alpay Date: Thu, 12 Dec 2024 18:14:26 +0100 Subject: [PATCH 106/126] Avoid using AdaptiveCpp_jit namespace outside of SSCP --- include/hipSYCL/algorithms/util/memory_streaming.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hipSYCL/algorithms/util/memory_streaming.hpp b/include/hipSYCL/algorithms/util/memory_streaming.hpp index fba2b5f54..f87914a03 100644 --- a/include/hipSYCL/algorithms/util/memory_streaming.hpp +++ b/include/hipSYCL/algorithms/util/memory_streaming.hpp @@ -62,8 +62,8 @@ class data_streamer { template static void run(std::size_t problem_size, sycl::nd_item<1> idx, F &&f) noexcept { - namespace jit = sycl::AdaptiveCpp_jit; __acpp_if_target_sscp( + namespace jit = sycl::AdaptiveCpp_jit; jit::compile_if_else( jit::reflect() == jit::compiler_backend::host, From 0ddcea7129f14e3ccd1eff68336204028cd92920 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 13 Dec 2024 04:33:12 +0100 Subject: [PATCH 107/126] Advertise AdaptiveCpp_jit namespace globally --- include/hipSYCL/sycl/jit.hpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/hipSYCL/sycl/jit.hpp b/include/hipSYCL/sycl/jit.hpp index c9b9e686e..e9e72a583 100644 --- a/include/hipSYCL/sycl/jit.hpp +++ b/include/hipSYCL/sycl/jit.hpp @@ -275,11 +275,21 @@ class dynamic_function_config { } +#else // IS_DEVICE_PASS_SSCP +// Define at least the namespace so that users can set global aliases +// for convenience, instead of only being able to define them inside +// __acpp_if_target_sscp(). +namespace hipsycl::sycl::AdaptiveCpp_jit {} + +#endif // IS_DEVICE_PASS_SSCP + +// Set jit alias for convenience. If SYCL ever claims this namespace +// we will have to remove it, so this is not currently publicly advertised. +// However, it aligns with certain early examples that were published around +// our JIT capabilities - if users try those, we need this bit. 
namespace hipsycl::sycl::jit { using namespace hipsycl::sycl::AdaptiveCpp_jit; } -#endif // IS_DEVICE_PASS_SSCP - #endif From 2942f91b627a40dc8c30420a835e83b7f146c8be Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 13 Dec 2024 05:44:01 +0100 Subject: [PATCH 108/126] Modernize nbody example and add performance calculation --- examples/CMakeLists.txt | 2 +- examples/bruteforce_nbody/CMakeLists.txt | 2 +- .../bruteforce_nbody/bruteforce_nbody.cpp | 212 ++++++++++-------- .../bruteforce_nbody/bruteforce_nbody.hpp | 3 +- 4 files changed, 117 insertions(+), 102 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 949222073..024d13cff 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -4,7 +4,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) -project(opensycl-examples) +project(adaptivecpp-examples) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/../cmake) diff --git a/examples/bruteforce_nbody/CMakeLists.txt b/examples/bruteforce_nbody/CMakeLists.txt index 48ef9d39b..c2aaa47cd 100644 --- a/examples/bruteforce_nbody/CMakeLists.txt +++ b/examples/bruteforce_nbody/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(bruteforce_nbody bruteforce_nbody.cpp) add_sycl_to_target(TARGET bruteforce_nbody SOURCES bruteforce_nbody.cpp) install(TARGETS bruteforce_nbody - RUNTIME DESTINATION share/hipSYCL/examples/) + RUNTIME DESTINATION share/AdaptiveCpp/examples/) diff --git a/examples/bruteforce_nbody/bruteforce_nbody.cpp b/examples/bruteforce_nbody/bruteforce_nbody.cpp index 2a97e55a7..37755ac85 100644 --- a/examples/bruteforce_nbody/bruteforce_nbody.cpp +++ b/examples/bruteforce_nbody/bruteforce_nbody.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include "bruteforce_nbody.hpp" #include "model.hpp" @@ -21,7 +22,7 @@ arithmetic_type mirror_position(const arithmetic_type mirror_pos, const arithmetic_type position) { - arithmetic_type delta = 
cl::sycl::fabs(mirror_pos - position); + arithmetic_type delta = sycl::fabs(mirror_pos - position); return (position <= mirror_pos) ? mirror_pos + delta : mirror_pos - delta; } @@ -30,15 +31,10 @@ int get_num_iterations_per_output_step() { char* val = std::getenv("NBODY_ITERATIONS_PER_OUTPUT"); if(!val) - return 1; + return 10; return std::stoi(val); } -template -using local_accessor = - sycl::accessor; int main() { @@ -102,16 +98,15 @@ int main() velocities_cloud2.begin(), velocities_cloud2.end()); - auto particles_buffer = - sycl::buffer{particles.data(), particles.size()}; - auto velocities_buffer = - sycl::buffer{velocities.data(), velocities.size()}; - auto forces_buffer = - sycl::buffer{sycl::range<1>{particles.size()}}; - - sycl::default_selector selector; - sycl::queue q{selector}; + sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order{}}; + + particle_type* particles_buffer = sycl::malloc_device(particles.size(), q); + vector_type* velocities_buffer = sycl::malloc_device(velocities.size(), q); + vector_type* forces_buffer = sycl::malloc_device(particles.size(), q); + q.copy(particles.data(), particles_buffer, particles.size()); + q.copy(velocities.data(), velocities_buffer, particles.size()); + auto execution_range = sycl::nd_range<1>{ sycl::range<1>{((num_particles + local_size - 1) / local_size) * local_size}, sycl::range<1>{local_size} @@ -119,44 +114,51 @@ int main() std::ofstream outputfile{"output.txt"}; + + const std::size_t num_particles = particles.size(); + + auto start_time = std::chrono::high_resolution_clock::now(); + double total_time = 0.0; + for(std::size_t t = 0; t < num_timesteps; ++t) { // Submit force calculation q.submit([&](sycl::handler& cgh){ - auto particles_access = - particles_buffer.get_access(cgh); - auto forces_access = - forces_buffer.get_access(cgh); - auto scratch = local_accessor{ + auto scratch = sycl::local_accessor{ sycl::range<1>{local_size}, cgh }; - cgh.parallel_for(execution_range, - 
[=](sycl::nd_item<1> tid){ - const size_t global_id = tid.get_global_id().get(0); - const size_t local_id = tid.get_local_id().get(0); - const size_t num_particles = particles_access.get_range()[0]; + cgh.parallel_for(execution_range, + [=](sycl::nd_item<1> tid){ + const std::size_t global_id = tid.get_global_id().get(0); + const std::size_t local_id = tid.get_local_id().get(0); + vector_type force{0.0f}; const particle_type my_particle = - (global_id < num_particles) ? particles_access[global_id] : particle_type{0.0f}; + (global_id < num_particles) ? particles_buffer[global_id] : particle_type{0.0f}; for(size_t offset = 0; offset < num_particles; offset += local_size) { if(offset + local_id < num_particles) - scratch[local_id] = particles_access[offset + local_id]; + scratch[local_id] = particles_buffer[offset + local_id]; else scratch[local_id] = particle_type{0.0f}; - tid.barrier(); + + sycl::group_barrier(tid.get_group()); for(int i = 0; i < local_size; ++i) { const particle_type p = scratch[i]; const vector_type p_direction = p.swizzle<0,1,2>(); + // 3 flops const vector_type R = p_direction - my_particle.swizzle<0,1,2>(); - // dot is not yet implemented + + // 6 flops (ignoring rsqrt, where we cannot quantify - this + // will be a major source of the reported number being off + // from peak) const arithmetic_type r_inv = sycl::rsqrt(R.x()*R.x() + R.y()*R.y() + R.z()*R.z() + gravitational_softening); @@ -164,99 +166,113 @@ int main() // Actually we just calculate the acceleration, not the // force. We only need the acceleration anyway. 
if(global_id != offset + i) + // 9 flops force += static_cast(p.w()) * r_inv * r_inv * r_inv * R; } - tid.barrier(); + sycl::group_barrier(tid.get_group()); } if(global_id < num_particles) - forces_access[global_id] = force; + forces_buffer[global_id] = force; }); }); // Time integration - q.submit([&](cl::sycl::handler& cgh){ - auto particles_access = - particles_buffer.get_access(cgh); - auto velocities_access = - velocities_buffer.get_access(cgh); - auto forces_access = - forces_buffer.get_access(cgh); - const arithmetic_type dt = ::dt; - - cgh.parallel_for(execution_range, - [=](sycl::nd_item<1> tid){ - const size_t global_id = tid.get_global_id().get(0); - const size_t num_particles = particles_access.get_range().get(0); + q.parallel_for(execution_range, + [=](sycl::nd_item<1> tid){ + const size_t global_id = tid.get_global_id().get(0); - if(global_id < num_particles) - { - particle_type p = particles_access[global_id]; - vector_type v = velocities_access[global_id]; - const vector_type acceleration = forces_access[global_id]; - - // Bring v to the current state - v += acceleration * dt; + if(global_id < num_particles) + { + particle_type p = particles_buffer[global_id]; + vector_type v = velocities_buffer[global_id]; + const vector_type acceleration = forces_buffer[global_id]; - // Update position - p.x() += v.x() * dt; - p.y() += v.y() * dt; - p.z() += v.z() * dt; + // Bring v to the current state + v += acceleration * dt; - // Reflect particle position and invert velocities - // if particles exit the simulation cube - if(static_cast(p.x()) <= -half_cube_size) - { - v.x() = cl::sycl::fabs(v.x()); - p.x() = mirror_position(-half_cube_size, p.x()); - } - else if(static_cast(p.x()) >= half_cube_size) - { - v.x() = -cl::sycl::fabs(v.x()); - p.x() = mirror_position(half_cube_size, p.x()); - } + // Update position + p.x() += v.x() * dt; + p.y() += v.y() * dt; + p.z() += v.z() * dt; - if(static_cast(p.y()) <= -half_cube_size) - { - v.y() = cl::sycl::fabs(v.y()); - 
p.y() = mirror_position(-half_cube_size, p.y()); - } - else if(static_cast(p.y()) >= half_cube_size) - { - v.y() = -cl::sycl::fabs(v.y()); - p.y() = mirror_position(half_cube_size, p.y()); - } + // Reflect particle position and invert velocities + // if particles exit the simulation cube + if(static_cast(p.x()) <= -half_cube_size) + { + v.x() = sycl::fabs(v.x()); + p.x() = mirror_position(-half_cube_size, p.x()); + } + else if(static_cast(p.x()) >= half_cube_size) + { + v.x() = -sycl::fabs(v.x()); + p.x() = mirror_position(half_cube_size, p.x()); + } - if(static_cast(p.z()) <= -half_cube_size) - { - v.z() = cl::sycl::fabs(v.z()); - p.z() = mirror_position(-half_cube_size, p.z()); - } - else if(static_cast(p.z()) >= half_cube_size) - { - v.z() = -cl::sycl::fabs(v.z()); - p.z() = mirror_position(half_cube_size, p.z()); - } + if(static_cast(p.y()) <= -half_cube_size) + { + v.y() = sycl::fabs(v.y()); + p.y() = mirror_position(-half_cube_size, p.y()); + } + else if(static_cast(p.y()) >= half_cube_size) + { + v.y() = -sycl::fabs(v.y()); + p.y() = mirror_position(half_cube_size, p.y()); + } - particles_access[global_id] = p; - velocities_access[global_id] = v; + if(static_cast(p.z()) <= -half_cube_size) + { + v.z() = sycl::fabs(v.z()); + p.z() = mirror_position(-half_cube_size, p.z()); } - }); + else if(static_cast(p.z()) >= half_cube_size) + { + v.z() = -sycl::fabs(v.z()); + p.z() = mirror_position(half_cube_size, p.z()); + } + + particles_buffer[global_id] = p; + velocities_buffer[global_id] = v; + } }); + if(t % iterations_per_output == 0) { - std::cout << "Writing output..." << std::endl; - auto particle_positions = - particles_buffer.get_access(); + // This wait is only needed for the performance measurement. + // We don't need it for the algorithm itself - but we don't want + // to include the data transfer time in the measurement. 
+ q.wait(); + auto stop_time = std::chrono::high_resolution_clock::now(); + total_time += + std::chrono::duration_cast(stop_time - start_time) + .count() * + 1.e-9; + + const std::size_t flops_per_iter = + 18 * num_particles * num_particles + 12 * num_particles; + std::cout << "Overall average performance: " + << 1.e-9 * flops_per_iter * (t + 1) / total_time << " GFlops" + << std::endl; + + q.copy(particles_buffer, particles.data(), particles.size()).wait(); + std::cout << "Writing output..." << std::endl; for(std::size_t i = 0; i < num_particles; ++i) { - outputfile << particle_positions[i].x() << " " - << particle_positions[i].y() << " " - << particle_positions[i].z() << " " << i << std::endl; + outputfile << particles[i].x() << " " + << particles[i].y() << " " + << particles[i].z() << " " << i << std::endl; } + + // start again for next iteration + start_time = std::chrono::high_resolution_clock::now(); } } + + q.wait(); + sycl::free(particles_buffer, q); + sycl::free(velocities_buffer, q); + sycl::free(forces_buffer, q); } diff --git a/examples/bruteforce_nbody/bruteforce_nbody.hpp b/examples/bruteforce_nbody/bruteforce_nbody.hpp index 1e119eb36..2ba6e6b79 100644 --- a/examples/bruteforce_nbody/bruteforce_nbody.hpp +++ b/examples/bruteforce_nbody/bruteforce_nbody.hpp @@ -12,8 +12,7 @@ #ifndef BRUTEFORCE_NBODY_HPP #define BRUTEFORCE_NBODY_HPP -#include -using namespace cl; +#include using arithmetic_type = float; using vector_type = sycl::vec; From ea4221b8b28fa5d425ad48e04440026fe7a6e048 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 13 Dec 2024 05:49:27 +0100 Subject: [PATCH 109/126] [renaming] Remove mention of hipSYCL during cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 24576e398..e17ad5caf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -282,7 +282,7 @@ if(BUILD_CLANG_PLUGIN) if(LLVM_DIR_OLD AND NOT ("${LLVM_DIR_OLD}" STREQUAL "${LLVM_DIR}")) message(WARNING "Could 
not find LLVM in the requested location LLVM_DIR=${LLVM_DIR_OLD}; using ${LLVM_DIR}.") endif() - message(STATUS "Building hipSYCL against LLVM configured from ${LLVM_DIR}") + message(STATUS "Building AdaptiveCpp against LLVM configured from ${LLVM_DIR}") #find_package(Clang REQUIRED) find_program(CLANG_EXECUTABLE_PATH NAMES clang++-${LLVM_VERSION_MAJOR} clang++-${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} clang++ HINTS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH CACHE STRING) From ebf62834a6293b0a9cabad98e7224a0a1904c44c Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 13 Dec 2024 23:25:08 +0100 Subject: [PATCH 110/126] [OpenMP] Improve alignment handling --- include/hipSYCL/glue/llvm-sscp/jit.hpp | 10 ++++++++++ src/runtime/adaptivity_engine.cpp | 5 +++-- src/runtime/omp/omp_allocator.cpp | 16 ++++++++++++---- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/include/hipSYCL/glue/llvm-sscp/jit.hpp b/include/hipSYCL/glue/llvm-sscp/jit.hpp index 2f97dd833..6d4552fa2 100644 --- a/include/hipSYCL/glue/llvm-sscp/jit.hpp +++ b/include/hipSYCL/glue/llvm-sscp/jit.hpp @@ -241,7 +241,11 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, if(translator->getKernels().size() == 1) { // Currently we only can specialize kernel arguments for the // single-kernel code object model + HIPSYCL_DEBUG_INFO << "jit: Configuring kernel " + << translator->getKernels()[0] << std::endl; for(const auto& entry : config.specialized_arguments()) { + HIPSYCL_DEBUG_INFO << "jit: Specializing argument " << entry.first + << " = " << entry.second << std::endl; translator->specializeKernelArgument(translator->getKernels().front(), entry.first, &entry.second); } @@ -249,10 +253,14 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, int num_param_indices = static_cast(config.get_num_kernel_param_indices()); for (int i = 0; i < num_param_indices; ++i) { if (config.has_kernel_param_flag(i, rt::kernel_param_flag::noalias)) { + 
HIPSYCL_DEBUG_INFO << "jit: Setting argument " << i << " to noalias" + << std::endl; translator->setNoAliasKernelParam(translator->getKernels().front(), i); } } for(const auto& entry : config.known_alignments()) { + HIPSYCL_DEBUG_INFO << "jit: Setting argument " << entry.first + << " to alignment " << entry.second << std::endl; translator->setKnownPtrParamAlignment(translator->getKernels().front(), entry.first, entry.second); } @@ -260,6 +268,8 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator, for(const auto& entry : config.function_call_specialization_config()) { auto& config = entry.value->function_call_map; for(const auto& call_specialization : config) { + HIPSYCL_DEBUG_INFO << "jit: Specializing function call to " + << call_specialization.first << std::endl; translator->specializeFunctionCalls(call_specialization.first, call_specialization.second, false); } diff --git a/src/runtime/adaptivity_engine.cpp b/src/runtime/adaptivity_engine.cpp index 9c900ef04..c630c23a4 100644 --- a/src/runtime/adaptivity_engine.cpp +++ b/src/runtime/adaptivity_engine.cpp @@ -165,13 +165,13 @@ int determine_ptr_alignment(uint64_t ptrval) { // do not support __has_builtin #define ACPP_HAS_BUILTIN_CTZ #else - #if __has_builtin(__builtin_ctz) + #if __has_builtin(__builtin_ctzll) #define ACPP_HAS_BUILTIN_CTZ #endif #endif #ifdef ACPP_HAS_BUILTIN_CTZ - uint64_t alignment = 1ull << __builtin_ctz(ptrval); + uint64_t alignment = 1ull << __builtin_ctzll(ptrval); return alignment >= 32 ? 
32 : 0; #else return 0; @@ -264,6 +264,7 @@ kernel_adaptivity_engine::finalize_binary_configuration( uint64_t buffer = 0; std::memcpy(&buffer, _arg_mapper.get_mapped_args()[i], _kernel_info->get_argument_size(i)); + int alignment = determine_ptr_alignment(buffer); if(alignment > 0) { HIPSYCL_DEBUG_INFO diff --git a/src/runtime/omp/omp_allocator.cpp b/src/runtime/omp/omp_allocator.cpp index 94234795e..d866fccd3 100644 --- a/src/runtime/omp/omp_allocator.cpp +++ b/src/runtime/omp/omp_allocator.cpp @@ -22,6 +22,13 @@ omp_allocator::omp_allocator(const device_id &my_device) : _my_device{my_device} {} void *omp_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { + if(min_alignment < 32) { + // Enforce alignment by default for performance reasons. + // 32 is chosen since this is what is currently needed by the adaptivity + // engine to consider an allocation strongly aligned. + return raw_allocate(32, size_bytes); + } + #if !defined(_WIN32) // posix requires alignment to be a multiple of sizeof(void*) if (min_alignment < sizeof(void*)) @@ -35,11 +42,12 @@ void *omp_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { min_alignment = 1; #endif - if(size_bytes % min_alignment != 0) - return nullptr; + if(min_alignment > 0 && size_bytes % min_alignment != 0) + return raw_allocate(min_alignment, + next_multiple_of(size_bytes, min_alignment)); - // ToDo: Mac OS CI has a problem with std::aligned_alloc - // but it's unclear if it's a Mac, or libc++, or toolchain issue + // ToDo: Mac OS CI has a problem with std::aligned_alloc + // but it's unclear if it's a Mac, or libc++, or toolchain issue #ifdef __APPLE__ return aligned_alloc(min_alignment, size_bytes); #elif !defined(_WIN32) From 0d20c9c568616649b44e353b24626473b1ed6727 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 13 Dec 2024 23:45:12 +0100 Subject: [PATCH 111/126] [NFC][doc] Align performance guide with change to ACPP_ALLOCATION_TRACKING naming --- doc/performance.md | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/performance.md b/doc/performance.md index 03b613c1c..b086c56c0 100644 --- a/doc/performance.md +++ b/doc/performance.md @@ -89,7 +89,7 @@ This optimization process is complete when the following warning is no longer pr The extent of this can be controlled using the environment variable `ACPP_ADAPTIVITY_LEVEL`. A value of 0 disables the feature. The default is 1. Higher levels are expected to result in higher peak performance, but may require more application runs to converge to this performance. The default level of 1 usually guarantees peak performance for the second application run. -Setting `ACPP_ENABLE_ALLOCATION_TRACKING=1` enables additional optimizations at adaptivity level 1. +Setting `ACPP_ALLOCATION_TRACKING=1` enables additional optimizations at adaptivity level 1. At adaptivity level >= 2, AdaptiveCpp will enable additional, aggressive optimizations. In particular, AdaptiveCpp will attempt to detect invariant kernel arguments, and hardwire those as constants during JIT time. In some cases, this can result in substantial performance increases. It is thus advisable to try setting `ACPP_ADAPTIVITY_LEVEL=2` and running the application a couple of times (typically 3-4 times). @@ -100,7 +100,7 @@ Note: Applications that are highly latency-sensitive may notice a slightly incre We recommend: * Experiment with `ACPP_ADAPTIVITY_LEVEL=1` and `ACPP_ADAPTIVITY_LEVEL=2` -* Experiment with `ACPP_ENABLE_ALLOCATION_TRACKING=1` and `ACPP_ENABLE_ALLOCATION_TRACKING=0`. +* Experiment with `ACPP_ALLOCATION_TRACKING=1` and `ACPP_ALLOCATION_TRACKING=0`. 
*Note: Adaptivity levels higher than 2 are currently not implemented.* From 523e6cca02d5c5e5cee277cf839d1e647778f0d6 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 13 Dec 2024 06:24:29 +0100 Subject: [PATCH 112/126] [doc][OpenCL] Add note that we require patch command --- doc/install-ocl.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/install-ocl.md b/doc/install-ocl.md index 42e9d4656..e4ae24550 100644 --- a/doc/install-ocl.md +++ b/doc/install-ocl.md @@ -2,6 +2,9 @@ You will need an OpenCL implementation, and the OpenCL icd loader. The OpenCL library can be specified using `cmake -DOpenCL_LIBRARY=/path/to/libOpenCL.so`. +In order to generate correct code, AdaptiveCpp needs to apply a patch to the Khronos llvm-spirv translator. +You *must* have the `patch` command installed and available when running the AdaptiveCpp `cmake` configuration. If you have run `cmake` without the `patch` command available, please *clean your build directory* before trying again. + The OpenCL backend can be enabled using `cmake -DWITH_OPENCL_BACKEND=ON` when building AdaptiveCpp. In order to run code successfully on an OpenCL device, it must support SPIR-V ingestion and the Intel USM (unified shared memory) extension. In a degraded mode, devices supporting OpenCL fine-grained system SVM (shared virtual memory) may work as well. 
From d89b746527e79e8ac23707fc9046ac9a50107522 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 13 Dec 2024 07:07:43 +0100 Subject: [PATCH 113/126] [NFC][doc] Add note on CPU memcpy in NUMA context --- doc/performance.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/performance.md b/doc/performance.md index 03b613c1c..f7187e07c 100644 --- a/doc/performance.md +++ b/doc/performance.md @@ -117,6 +117,7 @@ Clearing the cache can be accomplished by simply clearing the cache directory, e * When comparing CPU performance to icpx/DPC++, please note that DPC++ relies on either the Intel CPU OpenCL implementation or oneAPI construction kit to target CPUs. AdaptiveCpp can target CPUs either through OpenMP, or through OpenCL. In the latter case, it can use exactly the same OpenCL implementations that DPC++ uses for CPUs as well. So, if you notice that DPC++ performs better on CPU in some scenario, it might be a good idea to try the Intel OpenCL CPU implementation or the oneAPI construction kit with AdaptiveCpp! Drawing e.g. the conclusion that DPC++ is faster than AdaptiveCpp on CPU but only testing AdaptiveCpp's OpenMP backend is *not* correct reasoning! * When targeting the Intel OpenCL CPU implementation, you might also want to take into account [Intel's vectorizer tuning knobs](https://www.intel.com/content/www/us/en/docs/opencl-sdk/developer-guide-core-xeon/2018/vectorizer-knobs.html). * For the OpenMP backend, enable OpenMP thread pinning (e.g. `OMP_PROC_BIND=true`). AdaptiveCpp uses asynchronous worker threads for some light-weight tasks such as garbage collection, and these additional threads can interfere with kernel execution if OpenMP threads are not bound to cores. +* In multi-socket systems or other systems with strong NUMA behavior we recommend running one AdaptiveCpp process per socket (or NUMA domain) and using e.g. MPI to exchange data between the processes. 
This is because the SYCL implementations for data transfer functionality (`queue::memcpy` etc) for the OpenMP backend are currently not NUMA-aware. If your code depends on fast data transfers, you might run into NUMA issues otherwise. If you don't have performance critical data transfers in your code, this might not matter. Alternatively, on the CPU backend you can always use kernels to copy data which is always expected to deliver good performance. ### With omp.* compilation flow * When using `OMP_PROC_BIND`, there have been observations that performance suffers substantially, if AdaptiveCpp's OpenMP backend has been compiled against a different OpenMP implementation than the one used by `acpp` under the hood. For example, if `omp.accelerated` is used, `acpp` relies on clang and typically LLVM `libomp`, while the AdaptiveCpp runtime library may have been compiled with gcc and `libgomp`. The easiest way to resolve this is to appropriately use `cmake -DCMAKE_CXX_COMPILER=...` when building AdaptiveCpp to ensure that it is built using the same compiler. **If you observe substantial performance differences between AdaptiveCpp and native OpenMP, chances are your setup is broken.** From b66b3709a08c04d0687fc3b0cb841fca44989d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David--Cl=C3=A9ris=20Timoth=C3=A9e?= Date: Sat, 14 Dec 2024 09:34:40 +0000 Subject: [PATCH 114/126] rename flag to --acpp-dryrun-only-std-flags and add -I matcher --- bin/acpp | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/bin/acpp b/bin/acpp index 5f02a521f..f99888506 100755 --- a/bin/acpp +++ b/bin/acpp @@ -346,9 +346,9 @@ class acpp_config: 'is-dryrun': option("--acpp-dryrun", "ACPP_DRYRUN", "default-is-dryrun", """ If set, only shows compilation commands that would be executed, but does not actually execute it. 
"""), - 'is-dryrun-noplugin': option("--acpp-dryrun-noplugin", "ACPP_DRYRUN_NOPLUGIN", "default-is-dryrun-noplugin", + 'is-dryrun-only-std-flags': option("--acpp-dryrun-only-std-flags", "ACPP_DRYRUN_ONLYSTDFLAGS", "default-is-dryrun-only-std-flags", """ If set, only shows compilation commands that would be executed, - but does not actually execute it. This version also remove -fplugin related flags."""), + but does not actually execute it. This version also remove all non standard flags."""), 'is-explicit-multipass': option("--acpp-explicit-multipass", "ACPP_EXPLICIT_MULTIPASS", "default-is-explicit-multipass", """ If set, executes device passes as separate compiler invocation and lets AdaptiveCpp control embedding device @@ -803,9 +803,9 @@ class acpp_config: return False @property - def is_dryrun_noplugin(self): + def is_dryrun_only_std_flags(self): try: - return self._is_flag_set("is-dryrun-noplugin") + return self._is_flag_set("is-dryrun-only-std-flags") except OptionNotSet: return False @@ -940,8 +940,10 @@ def filter_cmd_args(command, verbose = False): "-I", "-D", "-W", "-std=","-pedantic-errors" ] + # you can cheat the handling of -I & -I by treating + # enable_next first with "-I " as matcher then the other with "-I" matcher whitelist_enable_next = [ - "-isystem", "-o", "-c" + "-isystem", "-o", "-c", "-I " ] add_next_arg = True # to add clang call @@ -951,16 +953,16 @@ def filter_cmd_args(command, verbose = False): add_next_arg = False continue - for w in whitelist: + for w in whitelist_enable_next: if arg.startswith(w): new_cmd.append(arg) - add_next_arg = False + add_next_arg = True continue - for w in whitelist_enable_next: + for w in whitelist: if arg.startswith(w): new_cmd.append(arg) - add_next_arg = True + add_next_arg = False continue if verbose: @@ -971,12 +973,12 @@ def filter_cmd_args(command, verbose = False): return new_cmd -def run_or_print(command, print_only, noplugin=False): +def run_or_print(command, print_only, only_std_flags=False): if 
not print_only: return subprocess.call(command) else: - if(noplugin): + if(only_std_flags): command = filter_cmd_args(command,verbose=False) print(' '.join(command)) return 0 @@ -1611,8 +1613,8 @@ class compiler: self._user_args = config.forwarded_compiler_arguments self._requires_linking = config.contains_linking_stage() self._requires_compilation = not config.is_pure_linking_stage() - self._is_dry_run = config.is_dryrun or config.is_dryrun_noplugin - self._no_plugins = config.is_dryrun_noplugin + self._is_dry_run = config.is_dryrun or config.is_dryrun_only_std_flags + self._only_std_flags = config.is_dryrun_only_std_flags self._targets = config.targets self._common_compiler_args = config.common_compiler_args self._acpp_path = config.acpp_installation_path @@ -1946,7 +1948,7 @@ class compiler: args += ld_flags return run_or_print([compiler_executable] + args, - self._is_dry_run, self._no_plugins) + self._is_dry_run, self._only_std_flags) def run(self): temp_prefix = "adaptivecpp-" From 99aad59d6f1614767fdebc70fd1977354259e5be Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Sat, 14 Dec 2024 18:23:17 +0100 Subject: [PATCH 115/126] hipSYCL -> AdaptiveCpp in error report --- include/hipSYCL/glue/error.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hipSYCL/glue/error.hpp b/include/hipSYCL/glue/error.hpp index a49ad4699..377d79c95 100644 --- a/include/hipSYCL/glue/error.hpp +++ b/include/hipSYCL/glue/error.hpp @@ -28,11 +28,11 @@ namespace glue { inline void print_async_errors(sycl::exception_list error_list) { if (error_list.size() > 0) { std::ostream& output_stream = common::output_stream::get().get_stream(); - output_stream << "============== hipSYCL error report ============== " + output_stream << "============== AdaptiveCpp error report ============== " << std::endl; output_stream - << "hipSYCL has caught the following unhandled asynchronous errors: " + << "AdaptiveCpp has caught the following unhandled asynchronous errors: " << 
std::endl << std::endl; int idx = 0; From 1b9d7a883ae991ec0dbf9d0cf4ecabf6cab7a8ba Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Tue, 17 Dec 2024 18:21:48 +0100 Subject: [PATCH 116/126] [CI] Clear SSCP JIT cache before running new job --- .github/workflows/linux-self-hosted.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/linux-self-hosted.yml b/.github/workflows/linux-self-hosted.yml index 188b5a8c7..5dcd30078 100644 --- a/.github/workflows/linux-self-hosted.yml +++ b/.github/workflows/linux-self-hosted.yml @@ -13,6 +13,9 @@ jobs: nvhpc_version: ['22.11'] steps: - uses: actions/checkout@v4 + - name: clear JIT cache + run: | + rm -rf ~/.acpp - name: build run : | mkdir build && cd build @@ -99,6 +102,9 @@ jobs: cuda: ['11.0'] # Just to be able to build the backend for explicit multipass steps: - uses: actions/checkout@v4 + - name: clear JIT cache + run: | + rm -rf ~/.acpp - name: build run : | mkdir build && cd build @@ -151,6 +157,9 @@ jobs: clang_version: ['15'] steps: - uses: actions/checkout@v4 + - name: clear JIT cache + run: | + rm -rf ~/.acpp - name: build run : | mkdir build && cd build From f8cc9a08ba5b67946642ebd1baa89a2db987ba62 Mon Sep 17 00:00:00 2001 From: Moritz Date: Tue, 17 Dec 2024 18:40:10 +0100 Subject: [PATCH 117/126] [SSCP][llvm-to-host] Adjust cbs (generic pass) lit tests to always use OMP --- tests/compiler/cbs/accumulator_for.cpp | 8 ++++---- tests/compiler/cbs/add_modulo.cpp | 8 ++++---- tests/compiler/cbs/cond_between_barriers.cpp | 8 ++++---- tests/compiler/cbs/conds.cpp | 8 ++++---- tests/compiler/cbs/conds_in_for.cpp | 8 ++++---- tests/compiler/cbs/const_init_accumulator_for.cpp | 8 ++++---- tests/compiler/cbs/for_in_cond.cpp | 8 ++++---- tests/compiler/cbs/group_barrier.cpp | 8 ++++---- tests/compiler/cbs/item_dependent_cond_in_for.cpp | 8 ++++---- tests/compiler/cbs/item_dependent_for.cpp | 8 ++++---- tests/compiler/cbs/multiple_indvars_for.cpp | 8 ++++---- tests/compiler/cbs/no_barriers.cpp | 8 
++++---- tests/compiler/cbs/reduce_const_for.cpp | 8 ++++---- tests/compiler/cbs/reduce_do_while.cpp | 8 ++++---- tests/compiler/cbs/reduce_for.cpp | 8 ++++---- tests/compiler/cbs/reduce_for_inverse_barrier.cpp | 8 ++++---- tests/compiler/cbs/reduce_nested_for.cpp | 8 ++++---- tests/compiler/cbs/reduce_unrolled.cpp | 8 ++++---- tests/compiler/cbs/reduce_while.cpp | 8 ++++---- tests/compiler/cbs/reduce_while_early_update.cpp | 8 ++++---- tests/compiler/cbs/right_heavy_cond.cpp | 8 ++++---- tests/compiler/cbs/simple_kernel.cpp | 8 ++++---- tests/compiler/cbs/stencil.cpp | 8 ++++---- tests/compiler/cbs/sycl_dgemm.cpp | 8 ++++---- tests/compiler/cbs/two_barrier_for.cpp | 8 ++++---- 25 files changed, 100 insertions(+), 100 deletions(-) diff --git a/tests/compiler/cbs/accumulator_for.cpp b/tests/compiler/cbs/accumulator_for.cpp index dad75c96c..43946df8f 100644 --- a/tests/compiler/cbs/accumulator_for.cpp +++ b/tests/compiler/cbs/accumulator_for.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/add_modulo.cpp b/tests/compiler/cbs/add_modulo.cpp index c8256ace3..729e5135e 100644 --- a/tests/compiler/cbs/add_modulo.cpp +++ b/tests/compiler/cbs/add_modulo.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t 
--acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/cond_between_barriers.cpp b/tests/compiler/cbs/cond_between_barriers.cpp index b866fdfb8..63d8cf045 100644 --- a/tests/compiler/cbs/cond_between_barriers.cpp +++ b/tests/compiler/cbs/cond_between_barriers.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include diff --git a/tests/compiler/cbs/conds.cpp b/tests/compiler/cbs/conds.cpp index 1451671aa..6e7223791 100644 --- a/tests/compiler/cbs/conds.cpp +++ b/tests/compiler/cbs/conds.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/conds_in_for.cpp b/tests/compiler/cbs/conds_in_for.cpp index 1c0d6784d..60ded21af 100644 --- 
a/tests/compiler/cbs/conds_in_for.cpp +++ b/tests/compiler/cbs/conds_in_for.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/const_init_accumulator_for.cpp b/tests/compiler/cbs/const_init_accumulator_for.cpp index 833a67a1a..27bcc104d 100644 --- a/tests/compiler/cbs/const_init_accumulator_for.cpp +++ b/tests/compiler/cbs/const_init_accumulator_for.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/for_in_cond.cpp b/tests/compiler/cbs/for_in_cond.cpp index 96e51a160..8e84d0326 100644 --- a/tests/compiler/cbs/for_in_cond.cpp +++ b/tests/compiler/cbs/for_in_cond.cpp @@ -3,10 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O 
-// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/group_barrier.cpp b/tests/compiler/cbs/group_barrier.cpp index 0535c5c72..9636529ec 100644 --- a/tests/compiler/cbs/group_barrier.cpp +++ b/tests/compiler/cbs/group_barrier.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/item_dependent_cond_in_for.cpp b/tests/compiler/cbs/item_dependent_cond_in_for.cpp index 6cde211be..5cbeacf39 100644 --- a/tests/compiler/cbs/item_dependent_cond_in_for.cpp +++ b/tests/compiler/cbs/item_dependent_cond_in_for.cpp @@ -3,10 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/item_dependent_for.cpp b/tests/compiler/cbs/item_dependent_for.cpp index 94a5368d6..5e43d2f4f 100644 --- 
a/tests/compiler/cbs/item_dependent_for.cpp +++ b/tests/compiler/cbs/item_dependent_for.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/multiple_indvars_for.cpp b/tests/compiler/cbs/multiple_indvars_for.cpp index 53b3296e7..500659e5e 100644 --- a/tests/compiler/cbs/multiple_indvars_for.cpp +++ b/tests/compiler/cbs/multiple_indvars_for.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/no_barriers.cpp b/tests/compiler/cbs/no_barriers.cpp index e193fdcd9..bae267194 100644 --- a/tests/compiler/cbs/no_barriers.cpp +++ b/tests/compiler/cbs/no_barriers.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | 
FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include diff --git a/tests/compiler/cbs/reduce_const_for.cpp b/tests/compiler/cbs/reduce_const_for.cpp index 884eccbc1..375c511a7 100644 --- a/tests/compiler/cbs/reduce_const_for.cpp +++ b/tests/compiler/cbs/reduce_const_for.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_do_while.cpp b/tests/compiler/cbs/reduce_do_while.cpp index 22f7aa726..f09c61339 100644 --- a/tests/compiler/cbs/reduce_do_while.cpp +++ b/tests/compiler/cbs/reduce_do_while.cpp @@ -3,10 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_for.cpp b/tests/compiler/cbs/reduce_for.cpp index 32c59070e..c3444764a 100644 --- a/tests/compiler/cbs/reduce_for.cpp +++ b/tests/compiler/cbs/reduce_for.cpp @@ 
-2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include diff --git a/tests/compiler/cbs/reduce_for_inverse_barrier.cpp b/tests/compiler/cbs/reduce_for_inverse_barrier.cpp index 345f4ee46..edc33d2ea 100644 --- a/tests/compiler/cbs/reduce_for_inverse_barrier.cpp +++ b/tests/compiler/cbs/reduce_for_inverse_barrier.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_nested_for.cpp b/tests/compiler/cbs/reduce_nested_for.cpp index 3cf707cbb..f8d84bb1b 100644 --- a/tests/compiler/cbs/reduce_nested_for.cpp +++ b/tests/compiler/cbs/reduce_nested_for.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t 
--acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_unrolled.cpp b/tests/compiler/cbs/reduce_unrolled.cpp index 1bd5e7fec..ecf4763ac 100644 --- a/tests/compiler/cbs/reduce_unrolled.cpp +++ b/tests/compiler/cbs/reduce_unrolled.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_while.cpp b/tests/compiler/cbs/reduce_while.cpp index cc5ca5490..56f28d8b9 100644 --- a/tests/compiler/cbs/reduce_while.cpp +++ b/tests/compiler/cbs/reduce_while.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_while_early_update.cpp b/tests/compiler/cbs/reduce_while_early_update.cpp index 3593fdc2b..a14e08ddc 100644 --- a/tests/compiler/cbs/reduce_while_early_update.cpp +++ 
b/tests/compiler/cbs/reduce_while_early_update.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/right_heavy_cond.cpp b/tests/compiler/cbs/right_heavy_cond.cpp index e3603ca41..ff8b6f327 100644 --- a/tests/compiler/cbs/right_heavy_cond.cpp +++ b/tests/compiler/cbs/right_heavy_cond.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/simple_kernel.cpp b/tests/compiler/cbs/simple_kernel.cpp index dd92e49c8..862696cab 100644 --- a/tests/compiler/cbs/simple_kernel.cpp +++ b/tests/compiler/cbs/simple_kernel.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t 
--acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/stencil.cpp b/tests/compiler/cbs/stencil.cpp index cb1cfe12b..f0f3b954d 100644 --- a/tests/compiler/cbs/stencil.cpp +++ b/tests/compiler/cbs/stencil.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/sycl_dgemm.cpp b/tests/compiler/cbs/sycl_dgemm.cpp index 62e105370..2ed1683eb 100644 --- a/tests/compiler/cbs/sycl_dgemm.cpp +++ b/tests/compiler/cbs/sycl_dgemm.cpp @@ -2,10 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s // adapted from https://github.com/UoB-HPC/sycl_dgemm/blob/main/dgemm.cpp diff --git a/tests/compiler/cbs/two_barrier_for.cpp b/tests/compiler/cbs/two_barrier_for.cpp index 9b65e6bf5..168155932 100644 --- a/tests/compiler/cbs/two_barrier_for.cpp +++ b/tests/compiler/cbs/two_barrier_for.cpp @@ -2,10 
+2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -// RUN: %t | FileCheck %s -// RUN: %acpp %s -o %t --acpp-targets=generic --acpp-use-accelerated-cpu -O -// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp; %t | FileCheck %s #include #include From b239bab59c4a94ba3ad7af468b59b5e67efa2b6f Mon Sep 17 00:00:00 2001 From: Moritz Date: Tue, 17 Dec 2024 18:54:27 +0100 Subject: [PATCH 118/126] [SSCP][llvm-to-host] Move KnownWgSizeOpt from its own pass into HostKernelWrapperPass --- .../host/HostKernelWrapperPass.hpp | 8 ++- .../host/HostKnownWgSizePass.hpp | 38 ------------- src/compiler/llvm-to-backend/CMakeLists.txt | 2 +- .../host/HostKernelWrapperPass.cpp | 12 +++- .../host/HostKnownWgSizePass.cpp | 57 ------------------- .../llvm-to-backend/host/LLVMToHost.cpp | 4 +- 6 files changed, 18 insertions(+), 103 deletions(-) delete mode 100644 include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp delete mode 100644 src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp diff --git a/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp b/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp index 9979ea10a..de544ee9b 100644 --- a/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp +++ b/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp @@ -18,9 +18,13 @@ namespace compiler { class HostKernelWrapperPass : public llvm::PassInfoMixin { std::int64_t DynamicLocalMemSize; + std::array KnownWgSize; + public: - explicit HostKernelWrapperPass(std::int64_t DynamicLocalMemSize) - : DynamicLocalMemSize{DynamicLocalMemSize} {} + explicit HostKernelWrapperPass(std::int64_t 
DynamicLocalMemSize, int KnownGroupSizeX, + int KnownGroupSizeY, int KnownGroupSizeZ) + : DynamicLocalMemSize{DynamicLocalMemSize}, + KnownWgSize{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ} {} llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); static bool isRequired() { return true; } diff --git a/include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp b/include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp deleted file mode 100644 index 64369b4a1..000000000 --- a/include/hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard - * parallelism for CPUs and GPUs. - * - * Copyright The AdaptiveCpp Contributors - * - * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. - * See file LICENSE in the project root for full license details. - */ -// SPDX-License-Identifier: BSD-2-Clause -#ifndef HIPSYCL_HOST_KNOWN_WG_SIZE_HPP -#define HIPSYCL_HOST_KNOWN_WG_SIZE_HPP - -#include - -namespace hipsycl { -namespace compiler { - -/** - * SubCfgFormationPass internally uses the work-group size global variables. - * For example, we use them for loop trip counts. - * Since we know their value at run-time, we just replace all uses of the global variables with - * their respective constant value. 
- */ -class HostKnownWgSizePass : public llvm::PassInfoMixin { - std::array KnownWgSize; - -public: - explicit HostKnownWgSizePass(int KnownGroupSizeX, int KnownGroupSizeY, int KnownGroupSizeZ) - : KnownWgSize{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ} {} - - llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); -}; - -} // namespace compiler -} // namespace hipsycl - -#endif diff --git a/src/compiler/llvm-to-backend/CMakeLists.txt b/src/compiler/llvm-to-backend/CMakeLists.txt index 68deb7695..741c2ef55 100644 --- a/src/compiler/llvm-to-backend/CMakeLists.txt +++ b/src/compiler/llvm-to-backend/CMakeLists.txt @@ -220,7 +220,7 @@ if(WITH_SSCP_COMPILER) add_hipsycl_llvm_backend( BACKEND host - LIBRARY host/LLVMToHost.cpp host/HostKernelWrapperPass.cpp host/HostKnownWgSizePass.cpp + LIBRARY host/LLVMToHost.cpp host/HostKernelWrapperPass.cpp TOOL host/LLVMToHostTool.cpp) target_compile_definitions(llvm-to-host PRIVATE diff --git a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp index abb655dc1..9f87f9c7b 100644 --- a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp +++ b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp @@ -63,7 +63,8 @@ void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm: * This makes calling the kernel from the host code straighforward, as only the work group info * struct and the user arguments need to be passed to the wrapper. 
*/ -llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocalMemSize) { +llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocalMemSize, + const std::array &KnownWgSize) { auto M = F.getParent(); auto &Ctx = M->getContext(); @@ -156,6 +157,13 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal replaceUsesOfGVWith(*Wrapper, cbs::GroupIdGlobalNames[I], GroupIds[I]); replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I], LocalSize[I]); } + + for (auto i = 0ul; i < 3ul; ++i) { + if (KnownWgSize.at(i) != 0) + utils::replaceUsesOfGVWith(F, cbs::LocalSizeGlobalNames.at(i), + llvm::ConstantInt::get(SizeT, KnownWgSize.at(i)), PassPrefix); + } + replaceUsesOfGVWith(*Wrapper, cbs::SscpDynamicLocalMemoryPtrName, LocalMemPtr); F.setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage); @@ -175,7 +183,7 @@ llvm::PreservedAnalyses HostKernelWrapperPass::run(llvm::Function &F, if (!SAA || !SAA->isKernelFunc(&F)) return llvm::PreservedAnalyses::all(); - auto Wrapper = makeWrapperFunction(F, DynamicLocalMemSize); + auto Wrapper = makeWrapperFunction(F, DynamicLocalMemSize, KnownWgSize); HIPSYCL_DEBUG_INFO << PassPrefix << "Created kernel wrapper: " << Wrapper->getName() << "\n"; diff --git a/src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp b/src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp deleted file mode 100644 index 27c67e32a..000000000 --- a/src/compiler/llvm-to-backend/host/HostKnownWgSizePass.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard - * parallelism for CPUs and GPUs. - * - * Copyright The AdaptiveCpp Contributors - * - * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. - * See file LICENSE in the project root for full license details. 
- */ -// SPDX-License-Identifier: BSD-2-Clause -#include "hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp" -#include "hipSYCL/compiler/cbs/IRUtils.hpp" -#include "hipSYCL/compiler/cbs/SplitterAnnotationAnalysis.hpp" -#include -#include -#include -#include -#include -#include - -namespace hipsycl { -namespace compiler { - -namespace { - -constexpr llvm::StringRef PassPrefix = "[SSCP][HostWgSizeOpt] "; - -void replaceWgSizeGlobalsWithConstants(llvm::Function &F, const std::array &KnownWGSize) { - auto DL = F.getParent()->getDataLayout(); - auto SizeT = DL.getLargestLegalIntType(F.getContext()); - - for (auto i = 0ul; i < 3ul; ++i) { - if (KnownWGSize.at(i) != 0) - utils::replaceUsesOfGVWith(F, cbs::LocalSizeGlobalNames.at(i), - llvm::ConstantInt::get(SizeT, KnownWGSize.at(i)), PassPrefix); - } -} - -} // namespace - -llvm::PreservedAnalyses HostKnownWgSizePass::run(llvm::Function &F, - llvm::FunctionAnalysisManager &AM) { - - auto &MAM = AM.getResult(F); - auto *SAA = MAM.getCachedResult(*F.getParent()); - if (!SAA || !SAA->isKernelFunc(&F)) - return llvm::PreservedAnalyses::all(); - - replaceWgSizeGlobalsWithConstants(F, KnownWgSize); - - HIPSYCL_DEBUG_INFO << PassPrefix << "Replaced work-group size GVs with Constants\n"; - - return llvm::PreservedAnalyses::none(); -} - -} // namespace compiler -} // namespace hipsycl diff --git a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp index 0d336f6ab..0a9dae7f2 100644 --- a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp +++ b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp @@ -9,7 +9,6 @@ */ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/compiler/llvm-to-backend/host/LLVMToHost.hpp" -#include "hipSYCL/compiler/llvm-to-backend/host/HostKnownWgSizePass.hpp" #include "hipSYCL/common/debug.hpp" #include "hipSYCL/common/filesystem.hpp" @@ -102,8 +101,7 @@ bool LLVMToHostTranslator::toBackendFlavor(llvm::Module &M, PassHandler &PH) { 
registerCBSPipeline(MPM, hipsycl::compiler::OptLevel::O3, true); llvm::FunctionPassManager FPM; - FPM.addPass(HostKnownWgSizePass{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ}); - FPM.addPass(HostKernelWrapperPass{KnownLocalMemSize}); + FPM.addPass(HostKernelWrapperPass{KnownLocalMemSize, KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ}); MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM))); MPM.run(M, *PH.ModuleAnalysisManager); From 634c2f6a8fc484d23c0c04daa3558bdb023f1ffe Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 20 Dec 2024 00:21:58 +0100 Subject: [PATCH 119/126] [NFC][doc] Update SSCP implementation status With #1651, group algorithms are no longer unimplemented in SSCP. --- doc/compilation.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/compilation.md b/doc/compilation.md index 77f0e7e65..9b7f5541b 100644 --- a/doc/compilation.md +++ b/doc/compilation.md @@ -37,9 +37,7 @@ The generic SSCP flow can potentially provide very fast compile times, very good ### Implementation status -The SSCP flow is supported for all backends. - -Some features (e.g. SYCL 2020 reductions or group algorithms) are not yet implemented. +The SSCP flow is supported for all backends. The set of supported features is a strict superset of the features of other compilation flows. The only exception to this is the ability to mix-and-match SYCL with other backend-specific programming models. 
### How it works From 7d9149fcfaa054db7939004e05c418318bca200e Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Thu, 12 Dec 2024 02:33:29 +0100 Subject: [PATCH 120/126] Implement group algorithms --- .../sycl/libkernel/group_functions.hpp | 3 +- .../libkernel/sscp/builtins/broadcast.hpp | 6 +- .../sycl/libkernel/sscp/builtins/core.hpp | 166 +----- .../libkernel/sscp/builtins/core_typed.hpp | 198 +++++++ .../sscp/builtins/detail/broadcast.hpp | 41 ++ .../sscp/builtins/detail/reduction.hpp | 107 ++++ .../sscp/builtins/detail/scan_generic.hpp | 97 ++++ .../sscp/builtins/detail/scan_hiplike.hpp | 61 ++ .../sscp/builtins/detail/scan_host.hpp | 75 +++ .../sscp/builtins/detail/scan_subgroup.hpp | 53 ++ .../sscp/builtins/detail/shuffle.hpp | 75 +++ .../libkernel/sscp/builtins/detail/utils.hpp | 96 +++ .../libkernel/sscp/builtins/reduction.hpp | 7 +- .../sscp/builtins/scan_exclusive.hpp | 104 ++++ .../sscp/builtins/scan_inclusive.hpp | 88 +++ .../sycl/libkernel/sscp/builtins/shuffle.hpp | 5 +- .../sycl/libkernel/sscp/group_functions.hpp | 546 +++++++++++++++--- src/libkernel/sscp/amdgpu/CMakeLists.txt | 19 +- src/libkernel/sscp/amdgpu/broadcast.cpp | 38 ++ src/libkernel/sscp/amdgpu/collpredicate.cpp | 47 ++ src/libkernel/sscp/amdgpu/reduction.cpp | 148 +++++ src/libkernel/sscp/amdgpu/scan_exclusive.cpp | 162 ++++++ src/libkernel/sscp/amdgpu/scan_inclusive.cpp | 151 +++++ src/libkernel/sscp/amdgpu/shuffle.cpp | 136 +++++ src/libkernel/sscp/host/CMakeLists.txt | 8 +- src/libkernel/sscp/host/broadcast.cpp | 40 ++ src/libkernel/sscp/host/collpredicate.cpp | 48 ++ src/libkernel/sscp/host/reduction.cpp | 152 +++++ src/libkernel/sscp/host/scan_exclusive.cpp | 165 ++++++ src/libkernel/sscp/host/scan_inclusive.cpp | 154 +++++ src/libkernel/sscp/host/shuffle.cpp | 53 ++ src/libkernel/sscp/ptx/CMakeLists.txt | 19 +- src/libkernel/sscp/ptx/broadcast.cpp | 38 ++ src/libkernel/sscp/ptx/collpredicate.cpp | 47 ++ src/libkernel/sscp/ptx/reduction.cpp | 152 +++++ 
src/libkernel/sscp/ptx/scan_exclusive.cpp | 166 ++++++ src/libkernel/sscp/ptx/scan_inclusive.cpp | 155 +++++ src/libkernel/sscp/ptx/shuffle.cpp | 121 ++++ src/libkernel/sscp/spirv/CMakeLists.txt | 19 +- src/libkernel/sscp/spirv/broadcast.cpp | 38 ++ src/libkernel/sscp/spirv/collpredicate.cpp | 47 ++ src/libkernel/sscp/spirv/reduction.cpp | 148 +++++ src/libkernel/sscp/spirv/scan_exclusive.cpp | 162 ++++++ src/libkernel/sscp/spirv/scan_inclusive.cpp | 151 +++++ src/libkernel/sscp/spirv/shuffle.cpp | 134 +++++ 45 files changed, 4178 insertions(+), 268 deletions(-) create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/core_typed.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp create mode 100644 include/hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp create mode 100644 src/libkernel/sscp/amdgpu/broadcast.cpp create mode 100644 src/libkernel/sscp/amdgpu/collpredicate.cpp create mode 100644 src/libkernel/sscp/amdgpu/reduction.cpp create mode 100644 src/libkernel/sscp/amdgpu/scan_exclusive.cpp create mode 100644 src/libkernel/sscp/amdgpu/scan_inclusive.cpp create mode 100644 src/libkernel/sscp/amdgpu/shuffle.cpp create mode 100644 src/libkernel/sscp/host/broadcast.cpp create mode 100644 src/libkernel/sscp/host/collpredicate.cpp create mode 
100644 src/libkernel/sscp/host/reduction.cpp create mode 100644 src/libkernel/sscp/host/scan_exclusive.cpp create mode 100644 src/libkernel/sscp/host/scan_inclusive.cpp create mode 100644 src/libkernel/sscp/host/shuffle.cpp create mode 100644 src/libkernel/sscp/ptx/broadcast.cpp create mode 100644 src/libkernel/sscp/ptx/collpredicate.cpp create mode 100644 src/libkernel/sscp/ptx/reduction.cpp create mode 100644 src/libkernel/sscp/ptx/scan_exclusive.cpp create mode 100644 src/libkernel/sscp/ptx/scan_inclusive.cpp create mode 100644 src/libkernel/sscp/ptx/shuffle.cpp create mode 100644 src/libkernel/sscp/spirv/broadcast.cpp create mode 100644 src/libkernel/sscp/spirv/collpredicate.cpp create mode 100644 src/libkernel/sscp/spirv/reduction.cpp create mode 100644 src/libkernel/sscp/spirv/scan_exclusive.cpp create mode 100644 src/libkernel/sscp/spirv/scan_inclusive.cpp create mode 100644 src/libkernel/sscp/spirv/shuffle.cpp diff --git a/include/hipSYCL/sycl/libkernel/group_functions.hpp b/include/hipSYCL/sycl/libkernel/group_functions.hpp index 6df05afb5..22224f677 100644 --- a/include/hipSYCL/sycl/libkernel/group_functions.hpp +++ b/include/hipSYCL/sycl/libkernel/group_functions.hpp @@ -186,7 +186,8 @@ OutPtr joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, first, last, result, binary_op); } -template +template>, bool> = true> HIPSYCL_BUILTIN T exclusive_scan_over_group(Group g, V x, T init, BinaryOperation binary_op) { HIPSYCL_RETURN_DISPATCH_GROUP_ALGORITHM(__acpp_exclusive_scan_over_group, diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp index dc9c1c490..996ca4977 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp @@ -8,11 +8,10 @@ * See file LICENSE in the project root for full license details. 
*/ // SPDX-License-Identifier: BSD-2-Clause -#include "builtin_config.hpp" - #ifndef HIPSYCL_SSCP_BROADCAST_BUILTINS_HPP #define HIPSYCL_SSCP_BROADCAST_BUILTINS_HPP +#include "builtin_config.hpp" HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_work_group_broadcast_i8(__acpp_int32 sender, @@ -29,7 +28,6 @@ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int64 __acpp_sscp_work_group_broadcast_i64(__acpp_int32 sender, __acpp_int64 x); - HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_sub_group_broadcast_i8(__acpp_int32 sender, __acpp_int8 x); @@ -45,4 +43,6 @@ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int64 __acpp_sscp_sub_group_broadcast_i64(__acpp_int32 sender, __acpp_int64 x); + #endif + diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp index bc19c166c..39b678bb5 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp @@ -12,6 +12,7 @@ #define HIPSYCL_SSCP_BUILTINS_CORE_HPP #include "builtin_config.hpp" +#include "core_typed.hpp" #include @@ -38,171 +39,6 @@ HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_z(); HIPSYCL_SSCP_BUILTIN bool __acpp_sscp_if_global_sizes_fit_in_int(); -template -T __acpp_sscp_typed_get_global_linear_id() { - if constexpr(Dim == 1) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lid_x = (T)__acpp_sscp_get_local_id_x(); - - return gid_x * lsize_x + lid_x; - } else if constexpr(Dim == 2) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lsize_y = (T)__acpp_sscp_get_local_size_y(); - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - - T id_x = gid_x * lsize_x + lid_x; - T id_y = gid_y * lsize_y + lid_y; - - T global_size_x = lsize_x * ngroups_x; - - 
return global_size_x * id_y + id_x; - } else if constexpr(Dim == 3) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - T gid_z = (T)__acpp_sscp_get_group_id_z(); - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lsize_y = (T)__acpp_sscp_get_local_size_y(); - T lsize_z = (T)__acpp_sscp_get_local_size_z(); - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - T lid_z = (T)__acpp_sscp_get_local_id_z(); - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); - - T id_x = gid_x * lsize_x + lid_x; - T id_y = gid_y * lsize_y + lid_y; - T id_z = gid_z * lsize_z + lid_z; - - T global_size_x = lsize_x * ngroups_x; - T global_size_y = lsize_y * ngroups_y; - - return global_size_x * global_size_y * id_z + global_size_x * id_y + id_x; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_local_linear_id() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_local_id_x(); - } else if constexpr(Dim == 2) { - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - - return lsize_x * lid_y + lid_x; - } else if constexpr(Dim == 3) { - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - T lid_z = (T)__acpp_sscp_get_local_id_z(); - - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lsize_y = (T)__acpp_sscp_get_local_size_y(); - - return lsize_x * lsize_y * lid_z + lsize_x * lid_y + lid_x; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_group_linear_id() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_group_id_x(); - } else if constexpr(Dim == 2) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - - return ngroups_x * gid_y + gid_x; - } else if constexpr(Dim == 3) { - T gid_x = 
(T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - T gid_z = (T)__acpp_sscp_get_group_id_z(); - - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); - - return ngroups_x * ngroups_y * gid_z + ngroups_x * gid_y + gid_x; - } else { - return 0; - } -} - -template -T __acpp_sscp_typed_get_global_size() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); - } else if constexpr(Dim == 2) { - T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); - T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); - - return size_x * size_y; - } else if constexpr(Dim == 3) { - T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); - T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); - T size_z = (T)__acpp_sscp_get_local_size_z() * (T)__acpp_sscp_get_num_groups_z(); - - return size_x * size_y * size_z; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_local_size() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_local_size_x(); - } else if constexpr(Dim == 2) { - T size_x = (T)__acpp_sscp_get_local_size_x(); - T size_y = (T)__acpp_sscp_get_local_size_y(); - - return size_x * size_y; - } else if constexpr(Dim == 3) { - T size_x = (T)__acpp_sscp_get_local_size_x(); - T size_y = (T)__acpp_sscp_get_local_size_y(); - T size_z = (T)__acpp_sscp_get_local_size_z(); - - return size_x * size_y * size_z; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_num_groups() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_num_groups_x(); - } else if constexpr(Dim == 2) { - T size_x = (T)__acpp_sscp_get_num_groups_x(); - T size_y = (T)__acpp_sscp_get_num_groups_y(); - - return size_x * size_y; - } else if constexpr(Dim == 3) { - T size_x = (T)__acpp_sscp_get_num_groups_x(); - T size_y = 
(T)__acpp_sscp_get_num_groups_y(); - T size_z = (T)__acpp_sscp_get_num_groups_z(); - - return size_x * size_y * size_z; - } else { - return 0; - } -} - - - template size_t __acpp_sscp_get_global_linear_id() { if(__acpp_sscp_if_global_sizes_fit_in_int()) { diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/core_typed.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/core_typed.hpp new file mode 100644 index 000000000..5d607e722 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/core_typed.hpp @@ -0,0 +1,198 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_BUILTINS_CORE_TYPED_HPP +#define HIPSYCL_SSCP_BUILTINS_CORE_TYPED_HPP + +#include "builtin_config.hpp" + +#include + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_id_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_id_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_id_z(); + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_group_id_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_group_id_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_group_id_z(); + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_size_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_size_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_size_z(); + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_z(); + +template +T __acpp_sscp_typed_get_global_linear_id() { + if constexpr(Dim == 1) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T lsize_x = 
(T)__acpp_sscp_get_local_size_x(); + T lid_x = (T)__acpp_sscp_get_local_id_x(); + + return gid_x * lsize_x + lid_x; + } else if constexpr(Dim == 2) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = (T)__acpp_sscp_get_group_id_y(); + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + T lsize_y = (T)__acpp_sscp_get_local_size_y(); + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + + T id_x = gid_x * lsize_x + lid_x; + T id_y = gid_y * lsize_y + lid_y; + + T global_size_x = lsize_x * ngroups_x; + + return global_size_x * id_y + id_x; + } else if constexpr(Dim == 3) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = (T)__acpp_sscp_get_group_id_y(); + T gid_z = (T)__acpp_sscp_get_group_id_z(); + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + T lsize_y = (T)__acpp_sscp_get_local_size_y(); + T lsize_z = (T)__acpp_sscp_get_local_size_z(); + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + T lid_z = (T)__acpp_sscp_get_local_id_z(); + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); + + T id_x = gid_x * lsize_x + lid_x; + T id_y = gid_y * lsize_y + lid_y; + T id_z = gid_z * lsize_z + lid_z; + + T global_size_x = lsize_x * ngroups_x; + T global_size_y = lsize_y * ngroups_y; + + return global_size_x * global_size_y * id_z + global_size_x * id_y + id_x; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_local_linear_id() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_local_id_x(); + } else if constexpr(Dim == 2) { + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + + return lsize_x * lid_y + lid_x; + } else if constexpr(Dim == 3) { + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + T lid_z = (T)__acpp_sscp_get_local_id_z(); 
+ + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + T lsize_y = (T)__acpp_sscp_get_local_size_y(); + + return lsize_x * lsize_y * lid_z + lsize_x * lid_y + lid_x; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_group_linear_id() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_group_id_x(); + } else if constexpr(Dim == 2) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = (T)__acpp_sscp_get_group_id_y(); + + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + + return ngroups_x * gid_y + gid_x; + } else if constexpr(Dim == 3) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = (T)__acpp_sscp_get_group_id_y(); + T gid_z = (T)__acpp_sscp_get_group_id_z(); + + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); + + return ngroups_x * ngroups_y * gid_z + ngroups_x * gid_y + gid_x; + } else { + return 0; + } +} + +template +T __acpp_sscp_typed_get_global_size() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); + } else if constexpr(Dim == 2) { + T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); + + return size_x * size_y; + } else if constexpr(Dim == 3) { + T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); + T size_z = (T)__acpp_sscp_get_local_size_z() * (T)__acpp_sscp_get_num_groups_z(); + + return size_x * size_y * size_z; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_local_size() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_local_size_x(); + } else if constexpr(Dim == 2) { + T size_x = (T)__acpp_sscp_get_local_size_x(); + T size_y = (T)__acpp_sscp_get_local_size_y(); + + return size_x * size_y; + } else if constexpr(Dim == 3) { + T size_x = 
(T)__acpp_sscp_get_local_size_x(); + T size_y = (T)__acpp_sscp_get_local_size_y(); + T size_z = (T)__acpp_sscp_get_local_size_z(); + + return size_x * size_y * size_z; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_num_groups() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_num_groups_x(); + } else if constexpr(Dim == 2) { + T size_x = (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_num_groups_y(); + + return size_x * size_y; + } else if constexpr(Dim == 3) { + T size_x = (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_num_groups_y(); + T size_z = (T)__acpp_sscp_get_num_groups_z(); + + return size_x * size_y * size_z; + } else { + return 0; + } +} + + +#endif diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp new file mode 100644 index 000000000..c5beda9da --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp @@ -0,0 +1,41 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_BROADCAST_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_BROADCAST_BUILTINS_HPP + +#include "../barrier.hpp" +#include "../broadcast.hpp" +#include "../builtin_config.hpp" +#include "../core_typed.hpp" +#include "../shuffle.hpp" +#include "utils.hpp" + +#undef ACPP_TEMPLATE_DECLARATION_WG_BROADCAST + +namespace hipsycl::libkernel::sscp { + +template T wg_broadcast(__acpp_int32 sender, T x, V shrd_memory) { + + if (sender == __acpp_sscp_typed_get_local_linear_id<3, int>()) { + shrd_memory[0] = x; + }; + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + x = shrd_memory[0]; + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + return x; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp new file mode 100644 index 000000000..bad77082c --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp @@ -0,0 +1,107 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_REDUCTION_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_REDUCTION_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +namespace { +template +OutType sg_reduce_impl(OutType x, BinaryOperation binary_op, __acpp_int32 active_threads) { + const __acpp_uint32 lrange = __acpp_sscp_get_subgroup_max_size(); + const __acpp_uint32 lid = __acpp_sscp_get_subgroup_local_id(); + const __acpp_uint64 subgroup_size = active_threads; + auto local_x = x; + for (__acpp_int32 i = lrange / 2; i > 0; i /= 2) { + auto other_x = bit_cast(sg_select( + bit_cast::type>(local_x), lid + i)); + if (lid + i < subgroup_size) + local_x = binary_op(local_x, other_x); + } + return bit_cast( + sg_select(bit_cast::type>(local_x), 0)); +} +} // namespace + +template <__acpp_sscp_algorithm_op binary_op, typename OutType> OutType sg_reduce(OutType x) { + using op = typename get_op::type; + const __acpp_uint32 lrange = __acpp_sscp_get_subgroup_size(); + return sg_reduce_impl(x, op{}, lrange); +} + +template +OutType wg_reduce(OutType x, BinaryOperation op, MemoryType *shrd_mem) { + + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 wg_size = __acpp_sscp_typed_get_local_size<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + const __acpp_int32 first_sg_size = wg_broadcast(0, sg_size, &shrd_mem[0]); + + const __acpp_uint32 num_subgroups = (wg_size + max_sg_size - 1) / max_sg_size; + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + OutType local_reduce_result = sg_reduce_impl(x, op, sg_size); + + // Sum up until all sgs can load their data into shmem + if (subgroup_id < shmem_array_length) { + shrd_mem[subgroup_id] = local_reduce_result; + } + 
__acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + + for (int i = shmem_array_length; i < num_subgroups; i += shmem_array_length) { + if (subgroup_id >= i && subgroup_id < i + shmem_array_length) { + shrd_mem[subgroup_id % shmem_array_length] = + op(local_reduce_result, shrd_mem[subgroup_id % shmem_array_length]); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + + // Now we are filled up shared memory with the results of all the subgroups + // We reduce in shared memory until it fits into one sg + size_t elements_in_shmem = + num_subgroups < shmem_array_length ? num_subgroups : shmem_array_length; + for (int i = shmem_array_length / 2; i >= first_sg_size; i /= 2) { + if (wg_lid < i && wg_lid + i < elements_in_shmem) { + shrd_mem[wg_lid] = op(shrd_mem[wg_lid + i], shrd_mem[wg_lid]); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + + // Now we load the data into registers + if (wg_lid < first_sg_size) { + local_reduce_result = shrd_mem[wg_lid]; + int active_threads = num_subgroups < first_sg_size ? 
num_subgroups : first_sg_size; + local_reduce_result = sg_reduce_impl(local_reduce_result, op, active_threads); + } + + // Do a final broadcast + using internal_type = typename integer_type::type; + static_assert(sizeof(internal_type) == sizeof(OutType)); + local_reduce_result = bit_cast( + wg_broadcast(0, bit_cast(local_reduce_result), &shrd_mem[0])); + return local_reduce_result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp new file mode 100644 index 000000000..9260d999e --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp @@ -0,0 +1,97 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_GENEIC_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_GENEIC_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "scan_subgroup.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template +OutType wg_generic_scan(OutType x, BinaryOperation op, MemoryType shrd_mem, OutType init = 0) { + + // The last element of the shared memory is used to store the total sum for exclusive scans. 
+ const size_t shmem_array_length = SharedMemorySize - 1; + + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 wg_size = __acpp_sscp_typed_get_local_size<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + + const __acpp_uint32 num_subgroups = (wg_size + max_sg_size - 1) / max_sg_size; + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + const bool last_item_in_sg = (wg_lid % sg_size) == (sg_size - 1); + OutType sg_scan_result; + if constexpr (ExclusiveScan) { + sg_scan_result = sg_exclusive_scan(x, op, init); + } else { + sg_scan_result = sg_inclusive_scan(x, op); + } + + for (int i = 0; i < (num_subgroups - 1 + shmem_array_length) / shmem_array_length; i++) { + __acpp_uint32 first_active_thread = i * num_subgroups * max_sg_size; + __acpp_uint32 last_active_thread = (i + 1) * num_subgroups * max_sg_size; + last_active_thread = last_active_thread > wg_size ? 
wg_size : last_active_thread; + __acpp_uint32 relative_thread_id = wg_lid - first_active_thread; + if (subgroup_id / shmem_array_length == i) { + if (last_item_in_sg) { + + if constexpr (ExclusiveScan) { + shrd_mem[subgroup_id % shmem_array_length] = op(sg_scan_result, x); + } else { + shrd_mem[subgroup_id % shmem_array_length] = sg_scan_result; + } + } + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + // First shmem_array_length number of threads exclusive scan in shared memory + auto local_x = shrd_mem[relative_thread_id]; + for (__acpp_int32 j = 1; j < shmem_array_length; j *= 2) { + __acpp_int32 next_id = relative_thread_id - j; + if (next_id >= 0 && j <= relative_thread_id) { + if (relative_thread_id < shmem_array_length) { + auto other_x = shrd_mem[next_id]; + local_x = op(local_x, other_x); + shrd_mem[relative_thread_id] = local_x; + } + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + + if (subgroup_id > 0) { + auto current_segment_update = shrd_mem[(subgroup_id % shmem_array_length) - 1]; + sg_scan_result = op(current_segment_update, sg_scan_result); + } + if (i > 0) { + sg_scan_result = op(shrd_mem[shmem_array_length], sg_scan_result); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + shrd_mem[shmem_array_length] = sg_scan_result; + } + return sg_scan_result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp new file mode 100644 index 000000000..111414f96 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp @@ -0,0 +1,61 @@ +/* + * 
This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_HIPLIKE_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_HIPLIKE_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "scan_subgroup.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template +OutType wg_hiplike_scan(OutType x, BinaryOperation op, MemoryType shrd_mem, OutType init = 0) { + + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + const bool last_item_in_sg = (wg_lid % sg_size) == (sg_size - 1); + OutType sg_scan_result; + if constexpr (ExclusiveScan) { + sg_scan_result = sg_exclusive_scan(x, op, init); + } else { + sg_scan_result = sg_inclusive_scan(x, op); + } + + if (last_item_in_sg) { + if constexpr (ExclusiveScan) { + shrd_mem[subgroup_id] = op(sg_scan_result, x); + } else { + shrd_mem[subgroup_id] = sg_scan_result; + } + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + if (subgroup_id == 0) { + shrd_mem[wg_lid] = sg_inclusive_scan(shrd_mem[wg_lid], op); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + return subgroup_id > 0 ? 
op(shrd_mem[subgroup_id - 1], sg_scan_result) : sg_scan_result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp new file mode 100644 index 000000000..18dff327c --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp @@ -0,0 +1,75 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_HOST_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_HOST_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "scan_subgroup.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template +OutType wg_host_scan(OutType x, BinaryOperation op, MemoryType shrd_mem, OutType init = 0) { + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 wg_size = __acpp_sscp_typed_get_local_size<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + + const __acpp_uint32 num_subgroups = (wg_size + max_sg_size - 1) / max_sg_size; + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + const bool last_item_in_sg = (wg_lid % sg_size) == (sg_size - 1); + OutType local_x; + if constexpr (ExclusiveScan) { + if (wg_lid + 1 < wg_size) { + shrd_mem[wg_lid + 1] = x; + } else { + shrd_mem[0] = init; + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + local_x = 
shrd_mem[wg_lid]; + } else { + shrd_mem[wg_lid] = x; + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + local_x = x; + } + + OutType other_x; + // TODO: Here we can just call the host inclusive scan + for (__acpp_int32 i = 1; i < wg_size; i *= 2) { + __acpp_int32 next_id = wg_lid - i; + bool is_nextid_valid = (next_id >= 0) && (i <= wg_lid); + + if (is_nextid_valid) { + other_x = shrd_mem[next_id]; + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + + if (is_nextid_valid) { + local_x = op(local_x, other_x); + shrd_mem[wg_lid] = local_x; + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + return local_x; +} +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp new file mode 100644 index 000000000..60386c8df --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp @@ -0,0 +1,53 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_SUBGROUP_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_SUBGROUP_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template +T sg_inclusive_scan(T x, BinaryOperation binary_op) { + const __acpp_uint32 lid = __acpp_sscp_get_subgroup_local_id(); + const __acpp_uint32 lrange = __acpp_sscp_get_subgroup_max_size(); + const __acpp_uint64 subgroup_size = __acpp_sscp_get_subgroup_size(); + auto local_x = x; + for (__acpp_int32 i = 1; i < lrange; i *= 2) { + __acpp_uint32 next_id = lid - i; + auto other_x = bit_cast( + sg_shift_right(bit_cast::type>(local_x), i)); + if (next_id >= 0 && i <= lid) + local_x = binary_op(local_x, other_x); + } + return local_x; +} + +template +T sg_exclusive_scan(T x, BinaryOperation binary_op, T init) { + const __acpp_uint32 lid = __acpp_sscp_get_subgroup_local_id(); + const __acpp_uint64 subgroup_size = __acpp_sscp_get_subgroup_max_size(); + x = lid == 0 ? binary_op(x, init) : x; + auto result_inclusive = sg_inclusive_scan(x, binary_op); + auto result = bit_cast(sg_shift_right( + bit_cast::type>(result_inclusive), 1)); + result = lid % subgroup_size == 0 ? init : result; + return result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp new file mode 100644 index 000000000..ee0a2945c --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp @@ -0,0 +1,75 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. 
+ * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_DETAIL_SHUFFLE_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_SHUFFLE_BUILTINS_HPP + +#include "../builtin_config.hpp" +#include "../shuffle.hpp" + +namespace hipsycl::libkernel::sscp { + +template inline T sg_select(T, __acpp_int32) = delete; + +template <> inline __acpp_int8 sg_select<__acpp_int8>(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i8(value, id); +} + +template <> inline __acpp_int16 sg_select<__acpp_int16>(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i16(value, id); +} + +template <> inline __acpp_int32 sg_select<__acpp_int32>(__acpp_int32 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +template <> inline __acpp_int64 sg_select<__acpp_int64>(__acpp_int64 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i64(value, id); +} + +template T inline sg_shift_left(T, __acpp_int32) = delete; + +template <> __acpp_int8 inline sg_shift_left<__acpp_int8>(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i8(value, id); +} + +template <> __acpp_int16 inline sg_shift_left<__acpp_int16>(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i16(value, id); +} + +template <> __acpp_int32 inline sg_shift_left<__acpp_int32>(__acpp_int32 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i32(value, id); +} + +template <> __acpp_int64 inline sg_shift_left<__acpp_int64>(__acpp_int64 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i64(value, id); +} + +template T inline sg_shift_right(T, __acpp_int32) = delete; + +template <> __acpp_int8 inline sg_shift_right<__acpp_int8>(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shr_i8(value, id); +} + +template <> __acpp_int16 inline sg_shift_right<__acpp_int16>(__acpp_int16 value, __acpp_int32 id) { + 
return __acpp_sscp_sub_group_shr_i16(value, id); +} + +template <> __acpp_int32 inline sg_shift_right<__acpp_int32>(__acpp_int32 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shr_i32(value, id); +} + +template <> __acpp_int64 inline sg_shift_right<__acpp_int64>(__acpp_int64 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shr_i64(value, id); +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp new file mode 100644 index 000000000..78178184b --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp @@ -0,0 +1,96 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_UTILS_BUILTINS_HPP +#define HIPSYCL_SSCP_UTILS_BUILTINS_HPP + +#include "../builtin_config.hpp" + +#define ACPP_SHMEM_ATTRIBUTE \ + static __attribute__((loader_uninitialized)) __attribute__((address_space(3))) + +namespace hipsycl::libkernel::sscp { + +template Tout bit_cast(Tin x) { + Tout result; + result = __builtin_bit_cast(Tout, x); + return result; +} + +struct plus { + template T operator()(T lhs, T rhs) { return lhs + rhs; } +}; + +struct min { + template T operator()(T lhs, T rhs) { return lhs < rhs ? lhs : rhs; } +}; + +struct max { + template T operator()(T lhs, T rhs) { return lhs < rhs ? 
rhs : lhs; } +}; + +struct multiply { + template T operator()(T lhs, T rhs) { return lhs * rhs; } +}; + +struct bit_and { + template T operator()(T lhs, T rhs) { return lhs & rhs; } +}; + +struct bit_or { + template T operator()(T lhs, T rhs) { return lhs | rhs; } +}; + +struct bit_xor { + template T operator()(T lhs, T rhs) { return lhs ^ rhs; } +}; + +struct logical_and { + template T operator()(T lhs, T rhs) { return lhs and rhs; } +}; + +struct logical_or { + template T operator()(T lhs, T rhs) { return lhs or rhs; } +}; + +template <__acpp_sscp_algorithm_op op> struct get_op {}; + +#define MAP_SSCP_ALGORITHM_OP(sscp_algo_op, impl) \ + template <> struct get_op { using type = impl; }; + +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::plus, plus) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::multiply, multiply) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::min, min) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::max, max) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::bit_and, bit_and) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::bit_or, bit_or) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::bit_xor, bit_xor) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::logical_and, logical_and) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::logical_or, logical_or) + +#undef MAP_SSCP_ALGORITHM_OP + +template struct integer_type { using type = T; }; + +template <> struct integer_type<__acpp_f32> { using type = __acpp_int32; }; + +template <> struct integer_type<__acpp_f64> { using type = __acpp_int64; }; + +template <> struct integer_type<__acpp_uint8> { using type = __acpp_int8; }; + +template <> struct integer_type<__acpp_uint16> { using type = __acpp_int16; }; + +template <> struct integer_type<__acpp_uint32> { using type = __acpp_int32; }; + +template <> struct integer_type<__acpp_uint64> { using type = __acpp_int64; }; + +} // namespace hipsycl::libkernel::sscp + +#endif diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp 
b/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp index 1f1658262..c3653402e 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp @@ -8,12 +8,10 @@ * See file LICENSE in the project root for full license details. */ // SPDX-License-Identifier: BSD-2-Clause -#include "builtin_config.hpp" -#include "hipSYCL/sycl/libkernel/detail/half_representation.hpp" - #ifndef HIPSYCL_SSCP_REDUCTION_BUILTINS_HPP #define HIPSYCL_SSCP_REDUCTION_BUILTINS_HPP +#include "builtin_config.hpp" HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); @@ -48,8 +46,6 @@ __acpp_f32 __acpp_sscp_work_group_reduce_f32(__acpp_sscp_algorithm_op op, __acpp HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_f64 __acpp_sscp_work_group_reduce_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); - - HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); @@ -83,5 +79,4 @@ __acpp_f32 __acpp_sscp_sub_group_reduce_f32(__acpp_sscp_algorithm_op op, __acpp_ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_f64 __acpp_sscp_sub_group_reduce_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); - #endif diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp new file mode 100644 index 000000000..1f74221f8 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp @@ -0,0 +1,104 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_SCAN_EXCLUSIVE_BUILTINS_HPP +#define HIPSYCL_SSCP_SCAN_EXCLUSIVE_BUILTINS_HPP + +#include "builtin_config.hpp" + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_work_group_exclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x, + __acpp_int8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_work_group_exclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x, + __acpp_int16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_work_group_exclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x, + __acpp_int32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_work_group_exclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x, + __acpp_int64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_work_group_exclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x, + __acpp_uint8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_work_group_exclusive_scan_u16(__acpp_sscp_algorithm_op op, + __acpp_uint16 x, __acpp_uint16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_work_group_exclusive_scan_u32(__acpp_sscp_algorithm_op op, + __acpp_uint32 x, __acpp_uint32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_work_group_exclusive_scan_u64(__acpp_sscp_algorithm_op op, + __acpp_uint64 x, __acpp_uint64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_work_group_exclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x, + __acpp_f16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_work_group_exclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x, + __acpp_f32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_work_group_exclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x, + __acpp_f64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 
__acpp_sscp_sub_group_exclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x, + __acpp_int8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_exclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x, + __acpp_int16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_exclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x, + __acpp_int32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_exclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x, + __acpp_int64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_sub_group_exclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x, + __acpp_uint8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_sub_group_exclusive_scan_u16(__acpp_sscp_algorithm_op op, __acpp_uint16 x, + __acpp_uint16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_sub_group_exclusive_scan_u32(__acpp_sscp_algorithm_op op, __acpp_uint32 x, + __acpp_uint32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_sub_group_exclusive_scan_u64(__acpp_sscp_algorithm_op op, __acpp_uint64 x, + __acpp_uint64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_sub_group_exclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x, + __acpp_f16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_sub_group_exclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x, + __acpp_f32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_sub_group_exclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x, + __acpp_f64 init); + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp new file mode 100644 index 000000000..9f443fead --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp @@ -0,0 +1,88 @@ +/* 
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_SCAN_INCLUSIVE_BUILTINS_HPP +#define HIPSYCL_SSCP_SCAN_INCLUSIVE_BUILTINS_HPP + +#include "builtin_config.hpp" + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_work_group_inclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_work_group_inclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_work_group_inclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_work_group_inclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_work_group_inclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_work_group_inclusive_scan_u16(__acpp_sscp_algorithm_op op, + __acpp_uint16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_work_group_inclusive_scan_u32(__acpp_sscp_algorithm_op op, + __acpp_uint32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_work_group_inclusive_scan_u64(__acpp_sscp_algorithm_op op, + __acpp_uint64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_work_group_inclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_work_group_inclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_work_group_inclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); + 
+HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_inclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_inclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_inclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_inclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_sub_group_inclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_sub_group_inclusive_scan_u16(__acpp_sscp_algorithm_op op, + __acpp_uint16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_sub_group_inclusive_scan_u32(__acpp_sscp_algorithm_op op, + __acpp_uint32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_sub_group_inclusive_scan_u64(__acpp_sscp_algorithm_op op, + __acpp_uint64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_sub_group_inclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_sub_group_inclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_sub_group_inclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp index 66deaa2db..d343a09df 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp @@ -8,11 +8,11 @@ * See file LICENSE in the project root for full license details. 
*/ // SPDX-License-Identifier: BSD-2-Clause -#include "builtin_config.hpp" - #ifndef HIPSYCL_SSCP_SHUFFLE_BUILTINS_HPP #define HIPSYCL_SSCP_SHUFFLE_BUILTINS_HPP +#include "builtin_config.hpp" + HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta); @@ -127,7 +127,6 @@ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id); - HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_work_group_select_i8(__acpp_int8 value, __acpp_int32 id); diff --git a/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp b/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp index 6c33126f7..57dedb02a 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp @@ -34,6 +34,8 @@ #include "builtins/broadcast.hpp" #include "builtins/collpredicate.hpp" #include "builtins/reduction.hpp" +#include "builtins/scan_exclusive.hpp" +#include "builtins/scan_inclusive.hpp" #include "builtins/shuffle.hpp" namespace hipsycl { @@ -68,14 +70,11 @@ __acpp_group_barrier(sub_group g, // broadcast template -HIPSYCL_BUILTIN -T __acpp_group_broadcast( - group g, T x, - typename group::linear_id_type local_linear_id = 0) { +HIPSYCL_BUILTIN std::enable_if_t +__acpp_group_broadcast(group g, T x, typename group::linear_id_type local_linear_id = 0) { - if constexpr(sizeof(T) == 1) { - return maybe_bit_cast(__acpp_sscp_work_group_broadcast_i8( + return maybe_bit_cast(__acpp_sscp_work_group_broadcast_i8( static_cast<__acpp_int32>(local_linear_id), maybe_bit_cast<__acpp_int8>(x))); } else if constexpr(sizeof(T) == 2) { @@ -93,18 +92,9 @@ T __acpp_group_broadcast( } } -template -HIPSYCL_BUILTIN T __acpp_group_broadcast( - group g, T x, typename group::id_type local_id) { - - const auto sender_lid = linear_id::get( - local_id, g.get_local_range()); - return __acpp_group_broadcast(g, x, static_cast(sender_lid)); -} - template 
-HIPSYCL_BUILTIN T __acpp_group_broadcast( - sub_group g, T x, typename sub_group::linear_id_type local_linear_id = 0) { +HIPSYCL_BUILTIN std::enable_if_t +__acpp_group_broadcast(sub_group g, T x, typename sub_group::linear_id_type local_linear_id = 0) { // Song recommendation: Leaves' Eyes - Angel and the Ghost // @@ -138,11 +128,12 @@ T __acpp_group_broadcast(sub_group g, T x, return __acpp_group_broadcast(g, x, static_cast(local_id[0])); } -template + +template HIPSYCL_BUILTIN -std::enable_if_t<(sizeof(vec) > 8), vec> +std::enable_if_t<(sizeof(vec) > 8), vec> __acpp_group_broadcast( - Group g, vec x, + Group g, vec x, typename Group::linear_id_type local_linear_id = 0) { vec result; for (int i = 0; i < N; ++i) { @@ -151,11 +142,11 @@ __acpp_group_broadcast( return result; } -template +template HIPSYCL_BUILTIN -std::enable_if_t<(sizeof(marray) > 8), marray> +std::enable_if_t<(sizeof(marray) > 8), marray> __acpp_group_broadcast( - Group g, marray x, + Group g, marray x, typename Group::linear_id_type local_linear_id = 0) { marray result; for (int i = 0; i < N; ++i) { @@ -164,6 +155,14 @@ __acpp_group_broadcast( return result; } +template +HIPSYCL_BUILTIN T __acpp_group_broadcast( + group g, T x, typename group::id_type local_id) { + + const auto sender_lid = linear_id::get( + local_id, g.get_local_range()); + return __acpp_group_broadcast(g, x, static_cast(sender_lid)); +} // any_of @@ -334,10 +333,10 @@ HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::bit_or, T{0}) HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::bit_xor, T{0}) HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::logical_and, T{1}) HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::logical_or, T{0}) - +// ---- subgroup template < typename T, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && std::is_signed_v), int> = 0> + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> HIPSYCL_BUILTIN 
T __acpp_reduce_over_group(sub_group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { @@ -361,7 +360,7 @@ HIPSYCL_BUILTIN T __acpp_reduce_over_group(sub_group g, T x, template < typename T, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && !std::is_signed_v), int> = 0> + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> HIPSYCL_BUILTIN T __acpp_reduce_over_group(sub_group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { @@ -424,12 +423,12 @@ marray __acpp_reduce_over_group(sub_group g, marray x, BinaryOperation return result; } +// End of subgroup algos template < typename T, int Dim, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && std::is_signed_v), int> = 0> -HIPSYCL_BUILTIN -T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_work_group_reduce_i8( sscp_binary_operation_v, @@ -451,7 +450,7 @@ T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { template < typename T, int Dim, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && !std::is_signed_v), int> = 0> + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> HIPSYCL_BUILTIN T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { @@ -537,8 +536,7 @@ __acpp_joint_reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { using type = decltype(*first); - auto local = sscp_binary_operation_identity< - std::decay_t, sscp_binary_operation_v>::get(); + auto local = sscp_binary_operation_identity, sscp_binary_operation_v>::get(); if(start_ptr < last) local = *start_ptr; @@ -556,74 +554,442 @@ T __acpp_joint_reduce(Group g, Ptr first, 
Ptr last, T init, return binary_op(__acpp_joint_reduce(g, first, last, binary_op), init); } -// exclusive_scan +// subgroup inclusive_scan + +template < + typename T, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x))); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x))); + } +} + +template < + typename T, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x))); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x))); + } +} + +template +HIPSYCL_BUILTIN half __acpp_inclusive_scan_over_group(sub_group g, 
half x, + BinaryOperation binary_op) { + return detail::create_half(__acpp_sscp_sub_group_inclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x))); +} + +template +HIPSYCL_BUILTIN float __acpp_inclusive_scan_over_group(sub_group g, float x, + BinaryOperation binary_op) { + return __acpp_sscp_sub_group_inclusive_scan_f32(sscp_binary_operation_v, x); +} -template -HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group( - group g, V x, T init, BinaryOperation binary_op); +template +HIPSYCL_BUILTIN double __acpp_inclusive_scan_over_group(sub_group g, double x, + BinaryOperation binary_op) { + return __acpp_sscp_sub_group_inclusive_scan_f64(sscp_binary_operation_v, x); +} +// group inclusive scan -template -HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group( - sub_group g, V x, T init, BinaryOperation binary_op); +template < + typename T, int Dim, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x))); + } else { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x))); + } +} -template >, bool> = true> -HIPSYCL_BUILTIN T -__acpp_exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op); +template < + typename T, int Dim, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 
0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x))); + } else { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x))); + } +} -template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - T init, BinaryOperation binary_op); +template +HIPSYCL_BUILTIN half __acpp_inclusive_scan_over_group(group g, half x, + BinaryOperation binary_op) { + return detail::create_half(__acpp_sscp_work_group_inclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x))); +} -template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op); +template +HIPSYCL_BUILTIN float __acpp_inclusive_scan_over_group(group g, float x, + BinaryOperation binary_op) { + return __acpp_sscp_work_group_inclusive_scan_f32(sscp_binary_operation_v, x); +} + +template +HIPSYCL_BUILTIN double __acpp_inclusive_scan_over_group(group g, double x, + BinaryOperation binary_op) { + return __acpp_sscp_work_group_inclusive_scan_f64(sscp_binary_operation_v, x); +} -// inclusive_scan +template +HIPSYCL_BUILTIN vec __acpp_inclusive_scan_over_group(Group g, vec x, + BinaryOperation binary_op) { + vec result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_inclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + 
return result; +} -template +HIPSYCL_BUILTIN marray __acpp_inclusive_scan_over_group(Group g, marray x, + BinaryOperation binary_op) { + marray result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_inclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + return result; +} + +template +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(Group g, V x, T init, + BinaryOperation binary_op) { + const size_t lid = g.get_local_linear_id(); + x = lid == 0 ? binary_op(init, x) : x; + __acpp_group_barrier(g); + x = __acpp_inclusive_scan_over_group(g, x, binary_op); + __acpp_group_barrier(g); + return x; +} + +template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op, T init); +HIPSYCL_BUILTIN OutPtr __acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + const size_t lrange = g.get_local_range().size(); + const size_t num_elements = last - first; + const size_t lid = g.get_local_linear_id(); + using value_type = std::remove_reference_t; + + if (num_elements == 0) + return result; + + if (num_elements == 1) { + *result = *first; + return result; + } + + // Ptr start_ptr = first + lid; + using type = decltype(*first); + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + size_t segment = 0; + size_t num_segments = (num_elements + lrange - 1) / lrange; + + // for (Ptr p = start_ptr + lrange; p < last; p += lrange){ + for (size_t segment = 0; segment < num_segments; segment++) { + size_t element_idx = segment * lrange + lid; + auto local_element = element_idx < num_elements ? 
first[element_idx] : identity; + auto segment_result = __acpp_inclusive_scan_over_group(g, local_element, binary_op); + if (element_idx < num_elements) { + result[element_idx] = segment_result; + } + __acpp_group_barrier(g); + + if (segment > 0) { + auto update_value = result[segment * lrange - 1]; + if (element_idx < num_elements) { + result[element_idx] = binary_op(update_value, result[element_idx]); + } + } + __acpp_group_barrier(g); + } + return result; +} -template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op); +HIPSYCL_BUILTIN OutPtr __acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op, T init) { -template -HIPSYCL_BUILTIN -T __acpp_inclusive_scan_over_group( - group g, T x, BinaryOperation binary_op); + const size_t lrange = g.get_local_range().size(); + const size_t num_elements = last - first; + const size_t lid = g.get_local_linear_id(); + + if (lid == 0 && num_elements > 0) { + first[0] = binary_op(first[0], init); + } + __acpp_group_barrier(g); + OutPtr updated_result = __acpp_joint_inclusive_scan(g, first, last, result, binary_op); + __acpp_group_barrier(g); + return updated_result; +} + +// exclusive_scan -- subgroup + +template < + typename T, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x), identity)); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x), identity)); + } else if 
constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x), identity)); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x), identity)); + } +} + +template < + typename T, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x), identity)); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint16>(x), identity)); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x), identity)); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x), identity)); + } +} + +template +HIPSYCL_BUILTIN half __acpp_exclusive_scan_over_group(sub_group g, half x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return detail::create_half(__acpp_sscp_sub_group_exclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x), identity)); +} + +template +HIPSYCL_BUILTIN float __acpp_exclusive_scan_over_group(sub_group g, float x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return __acpp_sscp_sub_group_exclusive_scan_f32(sscp_binary_operation_v, x, + identity); +} + +template 
+HIPSYCL_BUILTIN double __acpp_exclusive_scan_over_group(sub_group g, double x,
+ BinaryOperation binary_op) {
+ auto identity = sscp_binary_operation_identity,
+ sscp_binary_operation_v>::get();
+ return __acpp_sscp_sub_group_exclusive_scan_f64(sscp_binary_operation_v, x,
+ identity);
+}
+
+// // exclusive scan group
+
+template <
+ typename T, int Dim, typename BinaryOperation,
+ std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0>
+HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(group g, T x, BinaryOperation binary_op) {
+ auto identity = sscp_binary_operation_identity,
+ sscp_binary_operation_v>::get();
+ if constexpr (sizeof(T) == 1) {
+ return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i8(
+ sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x), identity));
+ } else if constexpr (sizeof(T) == 2) {
+ return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i16(
+ sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x), identity));
+ } else if constexpr (sizeof(T) == 4) {
+ return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i32(
+ sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x), identity));
+ } else {
+ return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i64(
+ sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x), identity));
+ }
+}
+
+template <
+ typename T, int Dim, typename BinaryOperation,
+ std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0>
+HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(group g, T x, BinaryOperation binary_op) {
+ auto identity = sscp_binary_operation_identity,
+ sscp_binary_operation_v>::get();
+ if constexpr (sizeof(T) == 1) {
+ return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u8(
+ sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x), identity));
+ } else if constexpr (sizeof(T) == 2) {
+ return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u16(
+ sscp_binary_operation_v, 
maybe_bit_cast<__acpp_uint16>(x), identity)); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x), identity)); + } else { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x), identity)); + } +} + +template +HIPSYCL_BUILTIN half __acpp_exclusive_scan_over_group(group g, half x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return detail::create_half(__acpp_sscp_work_group_exclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x), identity)); +} + +template +HIPSYCL_BUILTIN float __acpp_exclusive_scan_over_group(group g, float x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return __acpp_sscp_work_group_exclusive_scan_f32(sscp_binary_operation_v, x, + identity); +} + +template +HIPSYCL_BUILTIN double __acpp_exclusive_scan_over_group(group g, double x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return __acpp_sscp_work_group_exclusive_scan_f64(sscp_binary_operation_v, x, + identity); +} + +template +HIPSYCL_BUILTIN vec __acpp_exclusive_scan_over_group(Group g, vec x, + BinaryOperation binary_op) { + vec result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_exclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + return result; +} + +template +HIPSYCL_BUILTIN marray __acpp_exclusive_scan_over_group(Group g, marray x, + BinaryOperation binary_op) { + marray result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_exclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + return result; +} -template -HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group( - sub_group g, T x, BinaryOperation binary_op); 
+template +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(Group g, V x, T init, + BinaryOperation binary_op) { + const size_t lid = g.get_local_linear_id(); + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + x = lid == 0 ? binary_op(init, x) : x; + __acpp_group_barrier(g); + x = __acpp_exclusive_scan_over_group(g, x, binary_op); + __acpp_group_barrier(g); + if (lid == 0) { + x = init; + } + return x; +} -template >, bool> = true> -HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group( - Group g, V x, T init, BinaryOperation binary_op) { - auto scan = __acpp_inclusive_scan_over_group(g, T{x}, binary_op); - return binary_op(scan, init); +HIPSYCL_BUILTIN OutPtr __acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + const size_t lid = g.get_local_linear_id(); + __acpp_joint_inclusive_scan(g, first, last - 1, result + 1, binary_op); + __acpp_group_barrier(g); + using type = decltype(*first); + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if (lid == 0) { + result[0] = identity; + } + __acpp_group_barrier(g); + + return result; +} + +template >, bool> = true> +HIPSYCL_BUILTIN OutPtr __acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + T init, BinaryOperation binary_op) { + + const size_t lrange = g.get_local_range().size(); + const size_t num_elements = last - first; + const size_t lid = g.get_local_linear_id(); + __acpp_group_barrier(g); + if (lid == 0 && num_elements > 0) { + first[0] = binary_op(first[0], init); + result[0] = init; + } + __acpp_group_barrier(g); + OutPtr updated_result = __acpp_joint_inclusive_scan(g, first, last - 1, result + 1, binary_op); + __acpp_group_barrier(g); + return updated_result; } // shift_left template HIPSYCL_BUILTIN -T __acpp_shift_group_left( +std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_left( group g, T x, typename group::linear_id_type delta = 1) { if 
constexpr(sizeof(T) == 1) { @@ -642,7 +1008,7 @@ T __acpp_shift_group_left( } template -HIPSYCL_BUILTIN T __acpp_shift_group_left( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_left( sub_group g, T x, typename sub_group::linear_id_type delta = 1) { if constexpr(sizeof(T) == 1) { @@ -667,6 +1033,7 @@ __acpp_shift_group_left(Group g, vec x, typename Group::linear_id_type delt vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_shift_group_left(g, x[i], delta); + __acpp_group_barrier(g); } return result; } @@ -678,13 +1045,14 @@ __acpp_shift_group_left(Group g, marray x, typename Group::linear_id_type d marray result; for(int i = 0; i < N; ++i) { result[i] = __acpp_shift_group_left(g, x[i], delta); + __acpp_group_barrier(g); } return result; } // shift_right template -HIPSYCL_BUILTIN T __acpp_shift_group_right( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_right( group g, T x, typename group::linear_id_type delta = 1) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_work_group_shr_i8( @@ -702,7 +1070,7 @@ HIPSYCL_BUILTIN T __acpp_shift_group_right( } template -HIPSYCL_BUILTIN T __acpp_shift_group_right( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_right( sub_group g, T x, typename sub_group::linear_id_type delta = 1) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_sub_group_shr_i8( @@ -727,6 +1095,7 @@ __acpp_shift_group_right(Group g, vec x, typename Group::linear_id_type del vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_shift_group_right(g, x[i], delta); + __acpp_group_barrier(g); } return result; } @@ -738,13 +1107,14 @@ __acpp_shift_group_right(Group g, marray x, typename Group::linear_id_type marray result; for(int i = 0; i < N; ++i) { result[i] = __acpp_shift_group_right(g, x[i], delta); + __acpp_group_barrier(g); } return result; } // permute_group_by_xor template -HIPSYCL_BUILTIN T __acpp_permute_group_by_xor( 
+HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_permute_group_by_xor( group g, T x, typename group::linear_id_type mask) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_work_group_permute_i8( @@ -762,7 +1132,7 @@ HIPSYCL_BUILTIN T __acpp_permute_group_by_xor( } template -HIPSYCL_BUILTIN T __acpp_permute_group_by_xor( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_permute_group_by_xor( sub_group g, T x, typename sub_group::linear_id_type mask) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_sub_group_permute_i8( @@ -786,6 +1156,7 @@ __acpp_permute_group_by_xor(Group g, vec x, typename Group::linear_id_type vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_permute_group_by_xor(g, x[i], mask); + __acpp_group_barrier(g); } return result; } @@ -797,13 +1168,14 @@ __acpp_permute_group_by_xor(Group g, marray x, typename Group::linear_id_ty marray result; for(int i = 0; i < N; ++i) { result[i] = __acpp_permute_group_by_xor(g, x[i], mask); + __acpp_group_barrier(g); } return result; } // select_from_group template -HIPSYCL_BUILTIN T __acpp_select_from_group( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_select_from_group( group g, T x, typename group::id_type remote_local_id) { __acpp_int32 linear_id = static_cast<__acpp_int32>( @@ -825,7 +1197,7 @@ HIPSYCL_BUILTIN T __acpp_select_from_group( } template -HIPSYCL_BUILTIN T __acpp_select_from_group( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_select_from_group( sub_group g, T x, typename sub_group::id_type remote_local_id) { __acpp_int32 linear_id = static_cast<__acpp_int32>(remote_local_id[0]); @@ -853,6 +1225,7 @@ __acpp_select_from_group(Group g, vec x, typename Group::id_type remote_loc vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_select_from_group(g, x[i], remote_local_id); + __acpp_group_barrier(g); } return result; } @@ -864,6 +1237,7 @@ __acpp_select_from_group(Group g, marray x, typename 
Group::id_type remote_ marray result; for(int i = 0; i < N; ++i) { result[i] = __acpp_select_from_group(g, x[i], remote_local_id); + __acpp_group_barrier(g); } return result; } diff --git a/src/libkernel/sscp/amdgpu/CMakeLists.txt b/src/libkernel/sscp/amdgpu/CMakeLists.txt index da6a56050..44f34354b 100644 --- a/src/libkernel/sscp/amdgpu/CMakeLists.txt +++ b/src/libkernel/sscp/amdgpu/CMakeLists.txt @@ -2,6 +2,23 @@ if(WITH_LLVM_TO_AMDGPU_AMDHSA) libkernel_generate_bitcode_target( TARGETNAME amdgpu-amdhsa TRIPLE amdgcn-amd-amdhsa - SOURCES atomic.cpp barrier.cpp core.cpp half.cpp integer.cpp math.cpp native.cpp print.cpp relational.cpp subgroup.cpp localmem.cpp + SOURCES + atomic.cpp + barrier.cpp + core.cpp + half.cpp + integer.cpp + math.cpp + native.cpp + print.cpp + relational.cpp + subgroup.cpp + localmem.cpp + shuffle.cpp + collpredicate.cpp + reduction.cpp + broadcast.cpp + scan_inclusive.cpp + scan_exclusive.cpp ADDITIONAL_ARGS -nogpulib) endif() diff --git a/src/libkernel/sscp/amdgpu/broadcast.cpp b/src/libkernel/sscp/amdgpu/broadcast.cpp new file mode 100644 index 000000000..c465eb57d --- /dev/null +++ b/src/libkernel/sscp/amdgpu/broadcast.cpp @@ -0,0 +1,38 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +#define ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + ACPP_SHMEM_ATTRIBUTE __acpp_##input_type shrd_x[1]; \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, &shrd_x[0]); \ + } + +ACPP_WORKGROUP_BCAST(i8, int8) +ACPP_WORKGROUP_BCAST(i16, int16) +ACPP_WORKGROUP_BCAST(i32, int32) +ACPP_WORKGROUP_BCAST(i64, int64) + +ACPP_SUBGROUP_BCAST(i8, int8) +ACPP_SUBGROUP_BCAST(i16, int16) +ACPP_SUBGROUP_BCAST(i32, int32) +ACPP_SUBGROUP_BCAST(i64, int64) \ No newline at end of file diff --git a/src/libkernel/sscp/amdgpu/collpredicate.cpp b/src/libkernel/sscp/amdgpu/collpredicate.cpp new file mode 100644 index 000000000..102f4e53f --- /dev/null +++ b/src/libkernel/sscp/amdgpu/collpredicate.cpp @@ -0,0 +1,47 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + + #include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/amdgpu/ockl.hpp" + + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_any(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_all(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_none(bool pred){ + bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_all(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_any(bool pred){ + return __ockl_wfany_i32(pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_none(bool pred){ + bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} diff --git a/src/libkernel/sscp/amdgpu/reduction.cpp b/src/libkernel/sscp/amdgpu/reduction.cpp new file mode 100644 index 000000000..6ba3e85f9 --- /dev/null +++ b/src/libkernel/sscp/amdgpu/reduction.cpp @@ -0,0 +1,148 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + +#define ACPP_SUBGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_REDUCTION(f16) +ACPP_SUBGROUP_FLOAT_REDUCTION(f32) +ACPP_SUBGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return 
hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) +ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + 
HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) +ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) +ACPP_WORKGROUP_INT_REDUCTION(u16, uint16) +ACPP_WORKGROUP_INT_REDUCTION(u32, uint32) 
+ACPP_WORKGROUP_INT_REDUCTION(u64, uint64) diff --git a/src/libkernel/sscp/amdgpu/scan_exclusive.cpp b/src/libkernel/sscp/amdgpu/scan_exclusive.cpp new file mode 100644 index 000000000..ec319c79d --- /dev/null +++ b/src/libkernel/sscp/amdgpu/scan_exclusive.cpp @@ -0,0 +1,162 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, 
__acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length 
= 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, 
hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/amdgpu/scan_inclusive.cpp b/src/libkernel/sscp/amdgpu/scan_inclusive.cpp new file mode 100644 index 000000000..9efa6ddeb --- /dev/null +++ b/src/libkernel/sscp/amdgpu/scan_inclusive.cpp @@ -0,0 +1,151 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return 
hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + 
+ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, 
int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/amdgpu/shuffle.cpp b/src/libkernel/sscp/amdgpu/shuffle.cpp new file mode 100644 index 000000000..cf86a1889 --- /dev/null +++ b/src/libkernel/sscp/amdgpu/shuffle.cpp @@ -0,0 +1,136 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/subgroup.hpp" + +namespace detail { +static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} +} // namespace detail + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shl_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shl_i32(__acpp_int32 value, __acpp_uint32 delta) { + auto sg_size = __acpp_sscp_get_subgroup_max_size(); + int self = detail::__lane_id(); + int index = (self + delta); + index = (int)((self & (sg_size - 1)) + delta) > sg_size ? 
self : index; + + return __builtin_amdgcn_ds_bpermute(index << 2, value); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shl_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shl_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shl_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shr_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shr_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shr_i32(__acpp_int32 value, __acpp_uint32 delta) { + int self = detail::__lane_id(); + int width = __acpp_sscp_get_subgroup_max_size(); + int index = self - delta; + index = (index < (self & ~(width - 1))) ? 
self : index; + return __builtin_amdgcn_ds_bpermute(index << 2, value); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shr_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shr_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shr_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_permute_i8(__acpp_int8 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_permute_i16(__acpp_int16 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_permute_i32(__acpp_int32 value, __acpp_int32 mask) { + int self = detail::__lane_id(); + int index = self ^ mask; + return __builtin_amdgcn_ds_bpermute(index << 2, value); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_permute_i64(__acpp_int64 value, __acpp_int32 mask) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_permute_i32(tmp[0], mask); + tmp[1] = __acpp_sscp_sub_group_permute_i32(tmp[1], mask); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_select_i8(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_select_i16(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_select_i32(__acpp_int32 
value, __acpp_int32 id) { + int max_subgroup_size = __acpp_sscp_get_subgroup_max_size(); + int index = id % max_subgroup_size; + return __builtin_amdgcn_ds_bpermute(index << 2, value); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_select_i32(tmp[0], id); + tmp[1] = __acpp_sscp_sub_group_select_i32(tmp[1], id); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} diff --git a/src/libkernel/sscp/host/CMakeLists.txt b/src/libkernel/sscp/host/CMakeLists.txt index 35e7e43ff..dae895689 100644 --- a/src/libkernel/sscp/host/CMakeLists.txt +++ b/src/libkernel/sscp/host/CMakeLists.txt @@ -11,10 +11,16 @@ if(WITH_LLVM_TO_HOST) half.cpp math.cpp native.cpp + shuffle.cpp print.cpp relational.cpp localmem.cpp - subgroup.cpp) + subgroup.cpp + reduction.cpp + broadcast.cpp + scan_inclusive.cpp + scan_exclusive.cpp + collpredicate.cpp) libkernel_generate_bitcode_target( TARGETNAME host diff --git a/src/libkernel/sscp/host/broadcast.cpp b/src/libkernel/sscp/host/broadcast.cpp new file mode 100644 index 000000000..c7df83746 --- /dev/null +++ b/src/libkernel/sscp/host/broadcast.cpp @@ -0,0 +1,40 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory(); + +#define HOST_ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + __acpp_##input_type *shrd_x = \ + static_cast<__acpp_##input_type *>(__acpp_sscp_host_get_internal_local_memory()); \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, shrd_x); \ + } + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +HOST_ACPP_WORKGROUP_BCAST(i8, int8) +HOST_ACPP_WORKGROUP_BCAST(i16, int16) +HOST_ACPP_WORKGROUP_BCAST(i32, int32) +HOST_ACPP_WORKGROUP_BCAST(i64, int64) + +ACPP_SUBGROUP_BCAST(i8, int8) +ACPP_SUBGROUP_BCAST(i16, int16) +ACPP_SUBGROUP_BCAST(i32, int32) +ACPP_SUBGROUP_BCAST(i64, int64) diff --git a/src/libkernel/sscp/host/collpredicate.cpp b/src/libkernel/sscp/host/collpredicate.cpp new file mode 100644 index 000000000..6138fedb4 --- /dev/null +++ b/src/libkernel/sscp/host/collpredicate.cpp @@ -0,0 +1,48 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#define ACPP_SSCP_OMP_LIBKERNEL + + #include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/amdgpu/ockl.hpp" + + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_any(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_all(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_none(bool pred){ + bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_all(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_any(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_none(bool pred){ + bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} diff --git a/src/libkernel/sscp/host/reduction.cpp b/src/libkernel/sscp/host/reduction.cpp new file mode 100644 index 000000000..daf220a66 --- /dev/null +++ b/src/libkernel/sscp/host/reduction.cpp @@ -0,0 +1,152 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + +HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory(); + +#define ACPP_SUBGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_REDUCTION(f16) +ACPP_SUBGROUP_FLOAT_REDUCTION(f32) +ACPP_SUBGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x); \ + case 
__acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, shrd_mem); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) 
+ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, shrd_mem); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) +ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) 
+ACPP_WORKGROUP_INT_REDUCTION(u16, uint16) +ACPP_WORKGROUP_INT_REDUCTION(u32, uint32) +ACPP_WORKGROUP_INT_REDUCTION(u64, uint64) diff --git a/src/libkernel/sscp/host/scan_exclusive.cpp b/src/libkernel/sscp/host/scan_exclusive.cpp new file mode 100644 index 000000000..0087cad16 --- /dev/null +++ b/src/libkernel/sscp/host/scan_exclusive.cpp @@ -0,0 +1,165 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#define ACPP_SSCP_OMP_LIBKERNEL + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory(); + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + 
+#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + 
HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem, 
init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, shrd_mem, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, shrd_mem, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/host/scan_inclusive.cpp b/src/libkernel/sscp/host/scan_inclusive.cpp new file mode 100644 index 000000000..ef7445df1 --- /dev/null +++ b/src/libkernel/sscp/host/scan_inclusive.cpp @@ -0,0 +1,154 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#define ACPP_SSCP_OMP_LIBKERNEL + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory(); + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, 
hipsycl::libkernel::sscp::max{}); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem); \ + 
default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, shrd_mem); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) 
+ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/host/shuffle.cpp b/src/libkernel/sscp/host/shuffle.cpp new file mode 100644 index 000000000..606d86255 --- /dev/null +++ b/src/libkernel/sscp/host/shuffle.cpp @@ -0,0 +1,53 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp" + +#define SUBGROUP_SIZE_ONE_SHUFLLE(int_size, direction) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_int##int_size __acpp_sscp_sub_group_##direction##_i##int_size(__acpp_int##int_size value, \ + __acpp_uint32 delta) { \ + return delta == 0 ? value : 0; \ + } + +SUBGROUP_SIZE_ONE_SHUFLLE(8, shl) +SUBGROUP_SIZE_ONE_SHUFLLE(16, shl) +SUBGROUP_SIZE_ONE_SHUFLLE(32, shl) +SUBGROUP_SIZE_ONE_SHUFLLE(64, shl) +SUBGROUP_SIZE_ONE_SHUFLLE(8, shr) +SUBGROUP_SIZE_ONE_SHUFLLE(16, shr) +SUBGROUP_SIZE_ONE_SHUFLLE(32, shr) +SUBGROUP_SIZE_ONE_SHUFLLE(64, shr) + +#define SUBGROUP_SIZE_ONE_PERMUTE(int_size) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_int##int_size __acpp_sscp_sub_group_permute_i##int_size(__acpp_int##int_size value, \ + __acpp_int32 mask) { \ + return mask xor 0 ? 
value : 0; \ + } + +SUBGROUP_SIZE_ONE_PERMUTE(8) +SUBGROUP_SIZE_ONE_PERMUTE(16) +SUBGROUP_SIZE_ONE_PERMUTE(32) +SUBGROUP_SIZE_ONE_PERMUTE(64) + +#define SUBGROUP_SIZE_ONE_SELECT(int_size) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_int##int_size __acpp_sscp_sub_group_select_i##int_size(__acpp_int##int_size value, \ + __acpp_int32 mask) { \ + return mask == 0 ? value : 0; \ + } + +SUBGROUP_SIZE_ONE_SELECT(8) +SUBGROUP_SIZE_ONE_SELECT(16) +SUBGROUP_SIZE_ONE_SELECT(32) +SUBGROUP_SIZE_ONE_SELECT(64) diff --git a/src/libkernel/sscp/ptx/CMakeLists.txt b/src/libkernel/sscp/ptx/CMakeLists.txt index 063c0a507..a5c4367de 100644 --- a/src/libkernel/sscp/ptx/CMakeLists.txt +++ b/src/libkernel/sscp/ptx/CMakeLists.txt @@ -3,6 +3,23 @@ if(WITH_LLVM_TO_PTX) libkernel_generate_bitcode_target( TARGETNAME ptx TRIPLE nvptx64-nvidia-cuda - SOURCES atomic.cpp barrier.cpp core.cpp half.cpp integer.cpp print.cpp relational.cpp math.cpp native.cpp localmem.cpp subgroup.cpp + SOURCES + atomic.cpp + barrier.cpp + core.cpp + half.cpp + integer.cpp + print.cpp + relational.cpp + math.cpp + native.cpp + localmem.cpp + subgroup.cpp + shuffle.cpp + reduction.cpp + broadcast.cpp + scan_inclusive.cpp + scan_exclusive.cpp + collpredicate.cpp ADDITIONAL_ARGS -Xclang -target-feature -Xclang +sm_60) endif() diff --git a/src/libkernel/sscp/ptx/broadcast.cpp b/src/libkernel/sscp/ptx/broadcast.cpp new file mode 100644 index 000000000..85a687283 --- /dev/null +++ b/src/libkernel/sscp/ptx/broadcast.cpp @@ -0,0 +1,38 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +#define ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + ACPP_SHMEM_ATTRIBUTE __acpp_##input_type shrd_x[1]; \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, &shrd_x[0]); \ + } + +ACPP_WORKGROUP_BCAST(i8,int8) +ACPP_WORKGROUP_BCAST(i16,int16) +ACPP_WORKGROUP_BCAST(i32,int32) +ACPP_WORKGROUP_BCAST(i64,int64) + +ACPP_SUBGROUP_BCAST(i8,int8) +ACPP_SUBGROUP_BCAST(i16,int16) +ACPP_SUBGROUP_BCAST(i32,int32) +ACPP_SUBGROUP_BCAST(i64,int64) diff --git a/src/libkernel/sscp/ptx/collpredicate.cpp b/src/libkernel/sscp/ptx/collpredicate.cpp new file mode 100644 index 000000000..b243639fb --- /dev/null +++ b/src/libkernel/sscp/ptx/collpredicate.cpp @@ -0,0 +1,47 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + + #include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/amdgpu/ockl.hpp" + + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_any(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_all(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_none(bool pred){ + bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_all(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_any(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_none(bool pred){ + bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} diff --git a/src/libkernel/sscp/ptx/reduction.cpp b/src/libkernel/sscp/ptx/reduction.cpp new file mode 100644 index 000000000..cab784123 --- /dev/null +++ b/src/libkernel/sscp/ptx/reduction.cpp @@ -0,0 +1,152 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + +#define ACPP_SUBGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_REDUCTION(f16) +ACPP_SUBGROUP_FLOAT_REDUCTION(f32) +ACPP_SUBGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return 
hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) 
+ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) 
+ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) +ACPP_WORKGROUP_INT_REDUCTION(u16, uint16) +ACPP_WORKGROUP_INT_REDUCTION(u32, uint32) +ACPP_WORKGROUP_INT_REDUCTION(u64, uint64) diff --git a/src/libkernel/sscp/ptx/scan_exclusive.cpp b/src/libkernel/sscp/ptx/scan_exclusive.cpp new file mode 100644 index 000000000..0950e8095 --- /dev/null +++ b/src/libkernel/sscp/ptx/scan_exclusive.cpp @@ -0,0 +1,166 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define 
ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) 
\ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case 
__acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0], init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/ptx/scan_inclusive.cpp b/src/libkernel/sscp/ptx/scan_inclusive.cpp new file mode 100644 index 000000000..70f0e1248 --- /dev/null +++ b/src/libkernel/sscp/ptx/scan_inclusive.cpp @@ -0,0 +1,155 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + case 
__acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, 
hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + 
__asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/ptx/shuffle.cpp b/src/libkernel/sscp/ptx/shuffle.cpp new file mode 100644 index 000000000..cf8ce45c6 --- /dev/null +++ b/src/libkernel/sscp/ptx/shuffle.cpp @@ -0,0 +1,121 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/subgroup.hpp" + +constexpr unsigned int FULL_MASK = 0xffffffff; + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shl_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shl_i32(__acpp_int32 value, __acpp_uint32 delta) { + // __acpp_uint32 mask = get_active_mask(); + return __nvvm_shfl_sync_down_i32(FULL_MASK, value, delta, 0x1f); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shl_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shl_i32(tmp[0], 
delta); + tmp[1] = __acpp_sscp_sub_group_shl_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shr_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shr_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shr_i32(__acpp_int32 value, __acpp_uint32 delta) { + // __acpp_uint32 mask = get_active_mask(); + return __nvvm_shfl_sync_up_i32(FULL_MASK, value, delta, 0); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shr_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shr_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shr_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_permute_i8(__acpp_int8 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_permute_i16(__acpp_int16 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_permute_i32(__acpp_int32 value, __acpp_int32 mask) { + // __acpp_uint32 active_thread_mask = get_active_mask(); + return __nvvm_shfl_sync_bfly_i32(FULL_MASK, value, mask, 0x1f); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_permute_i64(__acpp_int64 value, __acpp_int32 mask) { + int tmp[2]; + __builtin_memcpy(tmp, &value, 
sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_permute_i32(tmp[0], mask); + tmp[1] = __acpp_sscp_sub_group_permute_i32(tmp[1], mask); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_select_i8(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_select_i16(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_select_i32(__acpp_int32 value, __acpp_int32 id) { + return __nvvm_shfl_sync_idx_i32(FULL_MASK, value, id, 31); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_select_i32(tmp[0], id); + tmp[1] = __acpp_sscp_sub_group_select_i32(tmp[1], id); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} diff --git a/src/libkernel/sscp/spirv/CMakeLists.txt b/src/libkernel/sscp/spirv/CMakeLists.txt index 57a189762..da863b6a1 100644 --- a/src/libkernel/sscp/spirv/CMakeLists.txt +++ b/src/libkernel/sscp/spirv/CMakeLists.txt @@ -3,5 +3,22 @@ if(WITH_LLVM_TO_SPIRV) libkernel_generate_bitcode_target( TARGETNAME spirv TRIPLE spir64-unknown-unknown - SOURCES atomic.cpp barrier.cpp core.cpp half.cpp math.cpp native.cpp integer.cpp print.cpp relational.cpp localmem.cpp subgroup.cpp) + SOURCES + atomic.cpp + barrier.cpp + core.cpp + half.cpp + math.cpp + native.cpp + integer.cpp + print.cpp + relational.cpp + localmem.cpp + subgroup.cpp + shuffle.cpp + reduction.cpp + broadcast.cpp + scan_inclusive.cpp + scan_exclusive.cpp + collpredicate.cpp) endif() diff --git 
a/src/libkernel/sscp/spirv/broadcast.cpp b/src/libkernel/sscp/spirv/broadcast.cpp new file mode 100644 index 000000000..d9c3380d1 --- /dev/null +++ b/src/libkernel/sscp/spirv/broadcast.cpp @@ -0,0 +1,38 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +#define ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + ACPP_SHMEM_ATTRIBUTE __acpp_##input_type shrd_x[1]; \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, &shrd_x[0]); \ + } + +ACPP_WORKGROUP_BCAST(i8,int8) +ACPP_WORKGROUP_BCAST(i16,int16) +ACPP_WORKGROUP_BCAST(i32,int32) +ACPP_WORKGROUP_BCAST(i64,int64) + +ACPP_SUBGROUP_BCAST(i8,int8) +ACPP_SUBGROUP_BCAST(i16,int16) +ACPP_SUBGROUP_BCAST(i32,int32) +ACPP_SUBGROUP_BCAST(i64,int64) \ No newline at end of file diff --git a/src/libkernel/sscp/spirv/collpredicate.cpp b/src/libkernel/sscp/spirv/collpredicate.cpp new file mode 100644 index 000000000..b243639fb --- /dev/null +++ b/src/libkernel/sscp/spirv/collpredicate.cpp @@ -0,0 +1,47 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
+ * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + + #include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/amdgpu/ockl.hpp" + + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_any(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_all(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_none(bool pred){ + bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_all(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_any(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_none(bool pred){ + bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} diff --git a/src/libkernel/sscp/spirv/reduction.cpp b/src/libkernel/sscp/spirv/reduction.cpp new file mode 100644 index 000000000..6ba3e85f9 --- /dev/null +++ b/src/libkernel/sscp/spirv/reduction.cpp @@ -0,0 +1,148 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. 
+ * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + +#define ACPP_SUBGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_REDUCTION(f16) +ACPP_SUBGROUP_FLOAT_REDUCTION(f32) +ACPP_SUBGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x); \ + case 
__acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) +ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define 
ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) +ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) +ACPP_WORKGROUP_INT_REDUCTION(u16, uint16) 
+ACPP_WORKGROUP_INT_REDUCTION(u32, uint32) +ACPP_WORKGROUP_INT_REDUCTION(u64, uint64) diff --git a/src/libkernel/sscp/spirv/scan_exclusive.cpp b/src/libkernel/sscp/spirv/scan_exclusive.cpp new file mode 100644 index 000000000..9d92ec25d --- /dev/null +++ b/src/libkernel/sscp/spirv/scan_exclusive.cpp @@ -0,0 +1,162 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type 
__acpp_sscp_sub_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op 
op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_and: \ 
+ return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/spirv/scan_inclusive.cpp b/src/libkernel/sscp/spirv/scan_inclusive.cpp new file mode 100644 index 000000000..c1ce224af --- /dev/null +++ b/src/libkernel/sscp/spirv/scan_inclusive.cpp @@ -0,0 +1,151 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return 
hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + 
+ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, 
int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/spirv/shuffle.cpp b/src/libkernel/sscp/spirv/shuffle.cpp new file mode 100644 index 000000000..82a128595 --- /dev/null +++ b/src/libkernel/sscp/spirv/shuffle.cpp @@ -0,0 +1,134 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/subgroup.hpp" + +template +dataT __spirv_SubgroupShuffleINTEL(dataT Data, __acpp_uint32 InvocationId) noexcept; +template +dataT __spirv_SubgroupShuffleDownINTEL(dataT Current, dataT Next, __acpp_uint32 Delta) noexcept; +template +dataT __spirv_SubgroupShuffleUpINTEL(dataT Previous, dataT Current, __acpp_uint32 Delta) noexcept; +template +dataT __spirv_SubgroupShuffleXorINTEL(dataT Data, __acpp_uint32 Value) noexcept; + +template +ValueT __spirv_GroupNonUniformShuffle(__acpp_uint32, ValueT, IdT) noexcept; + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shl_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shl_i32(__acpp_int32 value, __acpp_uint32 delta) { + __acpp_int32 local_id = __acpp_sscp_get_subgroup_local_id(); + 
__acpp_int32 target_id = local_id + delta; + if (target_id >= __acpp_sscp_get_subgroup_size()) + target_id = local_id; + return __spirv_GroupNonUniformShuffle(3, value, target_id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shl_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shl_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shl_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shr_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shr_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shr_i32(__acpp_int32 value, __acpp_uint32 delta) { + __acpp_int32 local_id = __acpp_sscp_get_subgroup_local_id(); + __acpp_int32 target_id = local_id; + if (local_id >= delta) + target_id -= delta; + return __spirv_GroupNonUniformShuffle(3, value, target_id); + // return __spirv_SubgroupShuffleDownINTEL(value, value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shr_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shr_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shr_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_permute_i8(__acpp_int8 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + 
+HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_permute_i16(__acpp_int16 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_permute_i32(__acpp_int32 value, __acpp_int32 mask) { + return __spirv_SubgroupShuffleXorINTEL(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_permute_i64(__acpp_int64 value, __acpp_int32 mask) { + __acpp_int32 local_id = __acpp_sscp_get_subgroup_local_id(); + __acpp_int32 target_id = mask ^ local_id; + return __spirv_GroupNonUniformShuffle(3, value, target_id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_select_i8(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_select_i16(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_select_i32(__acpp_int32 value, __acpp_int32 id) { + return __builtin_bit_cast(__acpp_int32, __spirv_GroupNonUniformShuffle(3u, value, id)); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_select_i32(tmp[0], id); + tmp[1] = __acpp_sscp_sub_group_select_i32(tmp[1], id); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} From 477d3ed2db6fd9e3ece9f8b01a72812a5d1a9e08 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Thu, 12 Dec 2024 02:46:42 +0100 Subject: [PATCH 121/126] Changes to tests to support variable sizes subgroups Enable SSCP group tests by default --- .../sycl/group_functions/group_functions.hpp | 17 ++--- .../group_functions_binary_reduce.cpp | 66 
+++++++++---------- .../group_functions/group_functions_misc.cpp | 46 ++++++------- .../group_functions_reduce.cpp | 21 +++--- .../group_functions/group_functions_scan.cpp | 34 +++++----- 5 files changed, 84 insertions(+), 100 deletions(-) diff --git a/tests/sycl/group_functions/group_functions.hpp b/tests/sycl/group_functions/group_functions.hpp index 93412dfa2..78a662649 100644 --- a/tests/sycl/group_functions/group_functions.hpp +++ b/tests/sycl/group_functions/group_functions.hpp @@ -25,9 +25,8 @@ using namespace cl; -#ifndef __ACPP_ENABLE_LLVM_SSCP_TARGET__ #define HIPSYCL_ENABLE_GROUP_ALGORITHM_TESTS -#endif + #ifdef TESTS_GROUPFUNCTION_FULL @@ -142,7 +141,7 @@ T initialize_type(T init) { template, int> = 0> ACPP_KERNEL_TARGET T initialize_type(elementType init) { - constexpr size_t N = T::get_count(); + constexpr size_t N = T::size(); if constexpr (std::is_same_v, bool>) return T{init}; @@ -221,7 +220,7 @@ inline void create_bool_test_data(std::vector &buffer, size_t local_size, } template -void check_binary_reduce(std::vector buffer, size_t local_size, size_t global_size, +void check_binary_reduce(std::vector buffer, std::vector input, size_t local_size, size_t global_size, std::vector expected, std::string name, size_t break_size = 0, size_t offset = 0) { std::vector cases{"everything except one false", "everything false", @@ -268,6 +267,7 @@ void test_nd_group_function_1d(size_t elements_per_thread, DataGenerator dg, for (int i = 0; i < local_sizes.size(); ++i) { size_t local_size = local_sizes[i]; size_t global_size = global_sizes[i]; + uint32_t used_sgrp_size = 0; std::vector host_buf(elements_per_thread * global_size, T{}); @@ -277,11 +277,12 @@ void test_nd_group_function_1d(size_t elements_per_thread, DataGenerator dg, { sycl::buffer buf{host_buf.data(), host_buf.size()}; + sycl::buffer used_sgrp_size_buffer(&used_sgrp_size, 1); queue.submit([&](sycl::handler &cgh) { using namespace sycl::access; auto acc = buf.template get_access(cgh); - + auto 
sgpr_size_acc = used_sgrp_size_buffer.template get_access(cgh); cgh.parallel_for>( sycl::nd_range<1>{global_size, local_size}, [=](sycl::nd_item<1> item) { @@ -289,13 +290,13 @@ void test_nd_group_function_1d(size_t elements_per_thread, DataGenerator dg, auto sg = item.get_sub_group(); T local_value = acc[item.get_global_linear_id()]; - + sgpr_size_acc[0] = sg.get_max_local_range().size(); f(acc, item.get_global_linear_id(), sg, g, local_value); }); }); } - vf(host_buf, original_host_buf, local_size, global_size); + vf(host_buf, original_host_buf,used_sgrp_size, local_size, global_size); } } @@ -346,7 +347,7 @@ void test_nd_group_function_2d(size_t elements_per_thread, DataGenerator dg, }); } - vf(host_buf, original_host_buf, local_size * local_size, global_size * global_size); + vf(host_buf, original_host_buf,0, local_size * local_size, global_size * global_size); } } diff --git a/tests/sycl/group_functions/group_functions_binary_reduce.cpp b/tests/sycl/group_functions/group_functions_binary_reduce.cpp index e8dfc56e9..992b1861e 100644 --- a/tests/sycl/group_functions/group_functions_binary_reduce.cpp +++ b/tests/sycl/group_functions/group_functions_binary_reduce.cpp @@ -35,9 +35,9 @@ BOOST_AUTO_TEST_CASE(group_x_of_local) { acc[global_linear_id] = sycl::any_of_group(g, static_cast(local_value)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, false, true, true}, "any_of"); }; @@ -55,10 +55,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_local) { acc[global_linear_id] = sycl::all_of_group(g, static_cast(local_value)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t 
local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn, vOrig, local_size, global_size, std::vector{false, false, false, true}, "all_of"); }; @@ -75,10 +75,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_local) { acc[global_linear_id] = sycl::none_of_group(g, static_cast(local_value)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "none_of"); }; @@ -95,7 +95,6 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { using T = char; const size_t elements_per_thread = 1; - const uint32_t subgroup_size = detail::get_subgroup_size(sycl::queue{}); const auto data_generator = [](std::vector &v, size_t local_size, size_t global_size) { @@ -108,9 +107,9 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { acc[global_linear_id] = sycl::any_of_group(sg, static_cast(local_value)); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, false, true, true}, "any_of", subgroup_size); }; @@ -125,11 +124,11 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { acc[global_linear_id] = sycl::all_of_group(sg, static_cast(local_value)); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, 
false, false, true}, - "all_of", subgroup_size); + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, + "all_of bool sub group", subgroup_size); }; test_nd_group_function_1d<__LINE__, T>(elements_per_thread, data_generator, @@ -142,10 +141,10 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { acc[global_linear_id] = sycl::none_of_group(sg, static_cast(local_value)); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "none_of", subgroup_size); }; @@ -177,9 +176,9 @@ BOOST_AUTO_TEST_CASE(group_x_of_ptr_function) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, true, true, false}, "any_of", 0, 2 * global_size); }; @@ -204,10 +203,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_ptr_function) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "all_of", 0, 2 * global_size); }; @@ -232,10 +231,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_ptr_function) { acc[global_linear_id + 2 * global_size] = local; }; const auto 
validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, "none_of", 0, 2 * global_size); }; @@ -263,9 +262,9 @@ BOOST_AUTO_TEST_CASE(group_x_of_function) { sycl::any_of_group(g, static_cast(local_value), std::logical_not()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, true, true, false}, "any_of"); }; @@ -284,10 +283,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_function) { sycl::all_of_group(g, static_cast(local_value), std::logical_not()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "all_of"); }; @@ -305,10 +304,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_function) { sycl::none_of_group(g, static_cast(local_value), std::logical_not()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, "none_of"); }; @@ -325,7 +324,6 @@ 
BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { using T = char; const size_t elements_per_thread = 1; - const uint32_t subgroup_size = detail::get_subgroup_size(sycl::queue{}); const auto data_generator = [](std::vector &v, size_t local_size, size_t global_size) { @@ -339,9 +337,9 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { sycl::any_of_group(sg, static_cast(local_value), std::logical_not()); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, true, true, false}, "any_of", subgroup_size); }; @@ -357,10 +355,10 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { sycl::all_of_group(sg, static_cast(local_value), std::logical_not()); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "all_of", subgroup_size); }; @@ -375,10 +373,10 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { sycl::none_of_group(sg, static_cast(local_value), std::logical_not()); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, "none_of", subgroup_size); }; @@ -389,4 +387,4 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { } BOOST_AUTO_TEST_SUITE_END() 
-#endif +#endif \ No newline at end of file diff --git a/tests/sycl/group_functions/group_functions_misc.cpp b/tests/sycl/group_functions/group_functions_misc.cpp index 65c7086e7..77b867f29 100644 --- a/tests/sycl/group_functions/group_functions_misc.cpp +++ b/tests/sycl/group_functions/group_functions_misc.cpp @@ -46,7 +46,7 @@ BOOST_AUTO_TEST_CASE(group_barrier) { } }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = (i % local_size) * 10000; @@ -82,7 +82,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(g, local_value); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = detail::initialize_type(((int)i / local_size) * local_size) + @@ -112,7 +112,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(g, local_value, 10); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = detail::initialize_type(((int)i / local_size) * local_size + 10) + @@ -146,7 +146,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(g, local_value, sycl::id<2>(0, 10)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T 
expected = detail::initialize_type(((int)i / local_size) * local_size + 10) + @@ -187,9 +187,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(sg, local_value); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < vIn.size(); ++i) { int expected_base = i % local_size; expected_base = ((int)expected_base / subgroup_size) * @@ -217,16 +216,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { - acc[global_linear_id] = sycl::group_broadcast(sg, local_value, 10); + acc[global_linear_id] = sycl::group_broadcast(sg, local_value, 7); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < vIn.size(); ++i) { int expected_base = i % local_size; expected_base = ((int)expected_base / subgroup_size) * subgroup_size; - expected_base += ((int)i / local_size) * local_size + 10; + expected_base += ((int)i / local_size) * local_size + 7; T expected = detail::initialize_type(expected_base) + detail::get_offset(global_size); @@ -249,16 +247,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { - acc[global_linear_id] = sycl::group_broadcast(sg, local_value, sycl::id<1>(10)); + acc[global_linear_id] = sycl::group_broadcast(sg, local_value, sycl::id<1>(7)); }; const auto validation_function = [](const std::vector &vIn, - 
const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < vIn.size(); ++i) { int expected_base = i % local_size; expected_base = ((int)expected_base / subgroup_size) * subgroup_size; - expected_base += ((int)i / local_size) * local_size + 10; + expected_base += ((int)i / local_size) * local_size + 7; T expected = detail::initialize_type(expected_base) + detail::get_offset(global_size); @@ -294,7 +291,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_left(g, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -330,7 +327,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_right(g, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -366,7 +363,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::permute_group_by_xor(g, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -403,7 +400,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { sycl::select_from_group(g, local_value, sycl::id()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const 
std::vector &vOrig, size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -445,9 +442,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_left(sg, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for (size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { @@ -490,9 +486,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_right(sg, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for (size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { @@ -523,7 +518,6 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { } } }; - test_nd_group_function_1d<__LINE__, T>(elements_per_thread, data_generator, tested_function, validation_function); } @@ -534,9 +528,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::permute_group_by_xor(sg, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for 
(size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { @@ -579,9 +572,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::select_from_group(sg, local_value, sycl::id<1>()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for (size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { diff --git a/tests/sycl/group_functions/group_functions_reduce.cpp b/tests/sycl/group_functions/group_functions_reduce.cpp index 3a4799ed2..d131c4475 100644 --- a/tests/sycl/group_functions/group_functions_reduce.cpp +++ b/tests/sycl/group_functions/group_functions_reduce.cpp @@ -30,7 +30,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce_mul, T, test_types) { acc[global_linear_id] = sycl::reduce_over_group(g, local_value, std::multiplies()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = vOrig[i * local_size]; @@ -74,7 +74,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce, T, test_types) { acc[global_linear_id] = sycl::reduce_over_group(g, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = T{}; @@ -96,7 +96,6 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce, T, test_types) { test_nd_group_function_1d<__LINE__, 
T>(elements_per_thread, data_generator, tested_function, validation_function); - test_nd_group_function_2d<__LINE__, T>(elements_per_thread, data_generator, tested_function, validation_function); } @@ -108,7 +107,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce, T, test_types) { g, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = detail::initialize_type(10); @@ -158,7 +157,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce_ptr, T, test_types) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = T{}; @@ -197,7 +196,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce_ptr, T, test_types) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = detail::initialize_type(10); @@ -236,12 +235,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_reduce, T, test_types) { { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { - acc[global_linear_id] = sycl::reduce_over_group(sg, local_value, std::plus()); + acc[global_linear_id] = sycl::reduce_over_group(sg, local_value, sycl::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { 
- auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { T expected = T{}; auto actual_warp_size = local_size < subgroup_size ? local_size : subgroup_size; @@ -267,12 +265,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_reduce, T, test_types) { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { acc[global_linear_id] = sycl::reduce_over_group( - sg, local_value, detail::initialize_type(10), std::plus()); + sg, local_value, detail::initialize_type(10), sycl::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { T expected = detail::initialize_type(10); auto actual_warp_size = local_size < subgroup_size ? local_size : subgroup_size; diff --git a/tests/sycl/group_functions/group_functions_scan.cpp b/tests/sycl/group_functions/group_functions_scan.cpp index 89e81b89d..04ec7a7c4 100644 --- a/tests/sycl/group_functions/group_functions_scan.cpp +++ b/tests/sycl/group_functions/group_functions_scan.cpp @@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_mul, T, test_types) { g, local_value, detail::initialize_type(10), std::multiplies()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -77,7 +77,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan, T, test_types) { acc[global_linear_id] = sycl::exclusive_scan_over_group(g, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, 
size_t global_size) { std::vector expected(vOrig.size()); @@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan, T, test_types) { g, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -167,7 +167,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_ptr, T, test_types) { sycl::joint_exclusive_scan(g, start.get(), end.get(), out.get(), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -211,7 +211,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_ptr, T, test_types) { detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -223,7 +223,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_ptr, T, test_types) { for (size_t j = i * 2 * local_size; j < (i + 1) * local_size * 2; ++j) { T computed = vIn[j + global_size * 2]; - BOOST_TEST(detail::compare_type(expected[j], computed), + BOOST_TEST_REQUIRE(detail::compare_type(expected[j], computed), detail::type_to_string(computed) << " at position " << j << " instead of " << detail::type_to_string(expected[j]) @@ -258,10 +258,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_exclusive_scan, T, test_types) { acc[global_linear_id] = sycl::exclusive_scan_over_group(sg, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t 
global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { expected[i * local_size] = T{}; auto actual_warp_size = local_size < subgroup_size ? local_size : subgroup_size; @@ -294,10 +293,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_exclusive_scan, T, test_types) { sg, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { expected[i * local_size] = detail::initialize_type(10); @@ -340,7 +338,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan_mul, T, test_types) { sycl::inclusive_scan_over_group(g, local_value, std::multiplies()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -386,7 +384,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan, T, test_types) { acc[global_linear_id] = sycl::inclusive_scan_over_group(g, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -423,7 +421,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan, T, test_types) { g, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -475,7 +473,7 @@ 
BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan_ptr, T, test_types) { sycl::joint_inclusive_scan(g, start.get(), end.get(), out.get(), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -518,7 +516,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan_ptr, T, test_types) { std::plus(), detail::initialize_type(10)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -567,10 +565,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_inclusive_scan, T, test_types) { acc[global_linear_id] = sycl::inclusive_scan_over_group(sg, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { expected[i * local_size] = vOrig[i * local_size]; @@ -603,10 +600,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_inclusive_scan, T, test_types) { sg, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { expected[i * local_size] = vOrig[i * local_size] + detail::initialize_type(10); From 759f63d1b5036b6b514486d5fc308f973e294b24 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Thu, 12 Dec 2024 
15:33:30 +0100 Subject: [PATCH 122/126] Disable workgorup shuffle like tests --- tests/CMakeLists.txt | 4 ++++ tests/sycl/group_functions/group_functions_misc.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0e2c0375b..f35961e62 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,6 +4,7 @@ project(adaptivecpp-tests) set(Boost_USE_STATIC_LIBS off) set(BUILD_SHARED_LIBS on) set(REDUCED_LOCAL_MEM_USAGE OFF CACHE BOOL "Only run tests with reduced local memory usage to allow running on hardware with little local memory.") +set(ACPP_TEST_WORK_GROUP_SHUFFLE_EXT OFF CACHE BOOL "Enable work group shuffles tests that are an AdaptiveCpp extension.") find_package(Boost COMPONENTS unit_test_framework REQUIRED) @@ -39,6 +40,9 @@ if(REDUCED_LOCAL_MEM_USAGE) add_definitions(-DREDUCED_LOCAL_MEM_USAGE) endif() +if(ACPP_TEST_WORK_GROUP_SHUFFLE_EXT) + add_definitions(-DACPP_TEST_WORK_GROUP_SHUFFLE_EXT) +endif() #Use add_definitions for now for older cmake versions cmake_policy(SET CMP0005 NEW) diff --git a/tests/sycl/group_functions/group_functions_misc.cpp b/tests/sycl/group_functions/group_functions_misc.cpp index 77b867f29..7d7ed0c69 100644 --- a/tests/sycl/group_functions/group_functions_misc.cpp +++ b/tests/sycl/group_functions/group_functions_misc.cpp @@ -277,7 +277,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { } } -#if !defined(REDUCED_LOCAL_MEM_USAGE) +#if defined(ACPP_TEST_WORK_GROUP_SHUFFLE_EXT) and !defined(REDUCED_LOCAL_MEM_USAGE) BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { const size_t elements_per_thread = 1; const auto data_generator = [](std::vector &v, size_t local_size, From 7823ccc8932013e08de8d4cbfdc8bc355304e146 Mon Sep 17 00:00:00 2001 From: Joachim Meyer Date: Mon, 16 Dec 2024 18:09:37 +0100 Subject: [PATCH 123/126] Cleanup higher dim local id globals. 
--- src/compiler/cbs/SubCfgFormation.cpp | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/compiler/cbs/SubCfgFormation.cpp b/src/compiler/cbs/SubCfgFormation.cpp index 9b73472c8..5ab4d0a68 100644 --- a/src/compiler/cbs/SubCfgFormation.cpp +++ b/src/compiler/cbs/SubCfgFormation.cpp @@ -127,7 +127,8 @@ getLocalSizeArgumentFromAnnotation(llvm::Function &F) { for (auto &BB : F) for (auto &I : BB) if (auto *UI = llvm::dyn_cast(&I)) - if (hipsycl::llvmutils::starts_with(UI->getCalledFunction()->getName(), "llvm.var.annotation")) { + if (hipsycl::llvmutils::starts_with(UI->getCalledFunction()->getName(), + "llvm.var.annotation")) { HIPSYCL_DEBUG_INFO << *UI << '\n'; llvm::GlobalVariable *AnnotateStr = nullptr; if (auto *CE = llvm::dyn_cast(UI->getOperand(1)); @@ -142,8 +143,8 @@ getLocalSizeArgumentFromAnnotation(llvm::Function &F) { if (auto *Data = llvm::dyn_cast(AnnotateStr->getInitializer())) { if (Data->isString() && - hipsycl::llvmutils::starts_with(Data->getAsString(), - "hipsycl_nd_kernel_local_size_arg")) { + hipsycl::llvmutils::starts_with(Data->getAsString(), + "hipsycl_nd_kernel_local_size_arg")) { if (auto *BC = llvm::dyn_cast(UI->getOperand(0))) return {BC->getOperand(0), UI}; return {UI->getOperand(0), UI}; @@ -351,6 +352,13 @@ void createLoopsAround(llvm::Function &F, llvm::BasicBlock *AfterBB, VMap[mergeGVLoadsInEntry(F, LocalIdGlobalNamesRotated[0])] = IndVars[0]; + // in case code references all dimensions, we need to set the remaining dimensions to 0 + for (size_t D = Dim; D < 3; ++D) { + auto ID = mergeGVLoadsInEntry(F, LocalIdGlobalNamesRotated[D]); + ID->replaceAllUsesWith(Builder.getIntN(Idx->getType()->getIntegerBitWidth(), 0)); + ID->eraseFromParent(); + } + VMap[ContiguousIdx] = Idx; ContiguousIdx = Idx; } @@ -1311,7 +1319,6 @@ void createLoopsAroundKernel(llvm::Function &F, llvm::DominatorTree &DT, llvm::L Body = Body->getSingleSuccessor(); - llvm::SmallVector ExitBBs; llvm::BasicBlock *ExitBB = 
llvm::BasicBlock::Create(F.getContext(), "exit", &F); llvm::IRBuilder<> Bld{ExitBB}; @@ -1352,10 +1359,13 @@ void createLoopsAroundKernel(llvm::Function &F, llvm::DominatorTree &DT, llvm::L llvm::remapInstructionsInBlocks(Blocks, VMap); // remove uses of the undefined global id variables - for (int D = 0; D < Dim; ++D) + for (int D = 0; D < 3; ++D) if (auto *Load = - llvm::cast_or_null(getLoadForGlobalVariable(F, LocalIdGlobalNames[D]))) + llvm::cast_or_null(mergeGVLoadsInEntry(F, LocalIdGlobalNames[D]))) { + if (D >= Dim) + Load->replaceAllUsesWith(llvm::ConstantInt::get(Load->getType(), 0)); Load->eraseFromParent(); + } HIPSYCL_DEBUG_EXECUTE_VERBOSE(F.viewCFG()) } From 0894990c86558a8e6a0fc4dd7837b7ed6450e841 Mon Sep 17 00:00:00 2001 From: Joachim Meyer Date: Wed, 18 Dec 2024 11:14:43 +0100 Subject: [PATCH 124/126] Use correct local id name array. --- src/compiler/cbs/SubCfgFormation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compiler/cbs/SubCfgFormation.cpp b/src/compiler/cbs/SubCfgFormation.cpp index 5ab4d0a68..8b9e92212 100644 --- a/src/compiler/cbs/SubCfgFormation.cpp +++ b/src/compiler/cbs/SubCfgFormation.cpp @@ -354,7 +354,7 @@ void createLoopsAround(llvm::Function &F, llvm::BasicBlock *AfterBB, // in case code references all dimensions, we need to set the remaining dimensions to 0 for (size_t D = Dim; D < 3; ++D) { - auto ID = mergeGVLoadsInEntry(F, LocalIdGlobalNamesRotated[D]); + auto ID = mergeGVLoadsInEntry(F, LocalIdGlobalNames[D]); ID->replaceAllUsesWith(Builder.getIntN(Idx->getType()->getIntegerBitWidth(), 0)); ID->eraseFromParent(); } From c2b3ce166ccf53c6ef523e975b4df369943b3587 Mon Sep 17 00:00:00 2001 From: sbalint98 Date: Fri, 20 Dec 2024 21:54:31 +0100 Subject: [PATCH 125/126] use __builtin_bit_cast directly --- .../libkernel/sscp/builtins/detail/reduction.hpp | 14 ++++++++------ .../sscp/builtins/detail/scan_subgroup.hpp | 8 ++++---- .../sycl/libkernel/sscp/builtins/detail/utils.hpp | 6 ------ 3 files 
changed, 12 insertions(+), 16 deletions(-) diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp index bad77082c..8fa70e77f 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp @@ -28,13 +28,14 @@ OutType sg_reduce_impl(OutType x, BinaryOperation binary_op, __acpp_int32 active const __acpp_uint64 subgroup_size = active_threads; auto local_x = x; for (__acpp_int32 i = lrange / 2; i > 0; i /= 2) { - auto other_x = bit_cast(sg_select( - bit_cast::type>(local_x), lid + i)); + auto other_x = __builtin_bit_cast( + OutType, + sg_select(__builtin_bit_cast(typename integer_type::type, local_x), lid + i)); if (lid + i < subgroup_size) local_x = binary_op(local_x, other_x); } - return bit_cast( - sg_select(bit_cast::type>(local_x), 0)); + return __builtin_bit_cast( + OutType, sg_select(__builtin_bit_cast(typename integer_type::type, local_x), 0)); } } // namespace @@ -97,8 +98,9 @@ OutType wg_reduce(OutType x, BinaryOperation op, MemoryType *shrd_mem) { // Do a final broadcast using internal_type = typename integer_type::type; static_assert(sizeof(internal_type) == sizeof(OutType)); - local_reduce_result = bit_cast( - wg_broadcast(0, bit_cast(local_reduce_result), &shrd_mem[0])); + local_reduce_result = __builtin_bit_cast( + OutType, + wg_broadcast(0, __builtin_bit_cast(internal_type, local_reduce_result), &shrd_mem[0])); return local_reduce_result; } diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp index 60386c8df..5f79c49ed 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp @@ -28,8 +28,8 @@ T sg_inclusive_scan(T x, BinaryOperation binary_op) { auto local_x = x; for 
(__acpp_int32 i = 1; i < lrange; i *= 2) { __acpp_uint32 next_id = lid - i; - auto other_x = bit_cast( - sg_shift_right(bit_cast::type>(local_x), i)); + auto other_x = __builtin_bit_cast( + T, sg_shift_right(__builtin_bit_cast(typename integer_type::type, local_x), i)); if (next_id >= 0 && i <= lid) local_x = binary_op(local_x, other_x); } @@ -42,8 +42,8 @@ T sg_exclusive_scan(T x, BinaryOperation binary_op, T init) { const __acpp_uint64 subgroup_size = __acpp_sscp_get_subgroup_max_size(); x = lid == 0 ? binary_op(x, init) : x; auto result_inclusive = sg_inclusive_scan(x, binary_op); - auto result = bit_cast(sg_shift_right( - bit_cast::type>(result_inclusive), 1)); + auto result = __builtin_bit_cast( + T, sg_shift_right(__builtin_bit_cast(typename integer_type::type, result_inclusive), 1)); result = lid % subgroup_size == 0 ? init : result; return result; } diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp index 78178184b..d90c195c6 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp @@ -18,12 +18,6 @@ namespace hipsycl::libkernel::sscp { -template Tout bit_cast(Tin x) { - Tout result; - result = __builtin_bit_cast(Tout, x); - return result; -} - struct plus { template T operator()(T lhs, T rhs) { return lhs + rhs; } }; From 182294a730fc4e99a201e80c81cbf466e076c9c2 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Fri, 20 Dec 2024 23:32:52 +0100 Subject: [PATCH 126/126] Apply suggestions from code review --- .../llvm-to-backend/host/HostKernelWrapperPass.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp index 9f87f9c7b..5f819b9a2 100644 --- a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp +++ 
b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp @@ -155,13 +155,12 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal for (int I = 0; I < 3; ++I) { replaceUsesOfGVWith(*Wrapper, cbs::NumGroupsGlobalNames[I], NumGroups[I]); replaceUsesOfGVWith(*Wrapper, cbs::GroupIdGlobalNames[I], GroupIds[I]); - replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I], LocalSize[I]); - } - - for (auto i = 0ul; i < 3ul; ++i) { - if (KnownWgSize.at(i) != 0) - utils::replaceUsesOfGVWith(F, cbs::LocalSizeGlobalNames.at(i), - llvm::ConstantInt::get(SizeT, KnownWgSize.at(i)), PassPrefix); + if (KnownWgSize[I] != 0) { + replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I], + llvm::ConstantInt::get(SizeT, KnownWgSize[I])); + } else { + replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I], LocalSize[I]); + } } replaceUsesOfGVWith(*Wrapper, cbs::SscpDynamicLocalMemoryPtrName, LocalMemPtr);