diff --git a/.github/workflows/linux-lit.yml b/.github/workflows/linux-lit.yml index b091907da..73ba8b7f4 100644 --- a/.github/workflows/linux-lit.yml +++ b/.github/workflows/linux-lit.yml @@ -8,7 +8,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - clang: [14, 15, 16, 17] + clang: [14, 15, 16, 17, 18] os: [ubuntu-22.04] steps: - uses: actions/checkout@v4 @@ -71,7 +71,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - clang: [14, 15, 16, 17] + clang: [14, 15, 16, 17, 18] os: [ubuntu-22.04] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/linux-self-hosted.yml b/.github/workflows/linux-self-hosted.yml index 0a777fdf2..5dcd30078 100644 --- a/.github/workflows/linux-self-hosted.yml +++ b/.github/workflows/linux-self-hosted.yml @@ -13,6 +13,9 @@ jobs: nvhpc_version: ['22.11'] steps: - uses: actions/checkout@v4 + - name: clear JIT cache + run: | + rm -rf ~/.acpp - name: build run : | mkdir build && cd build @@ -60,6 +63,10 @@ jobs: echo "Running tests on CUDA..." cd ${GITHUB_WORKSPACE}/build/tests-cuda-emp ACPP_VISIBILITY_MASK="omp;cuda" ./sycl_tests + - name: run CUDA tests (SSCP) + run: | + cd ${GITHUB_WORKSPACE}/build/tests-sscp + ACPP_VISIBILITY_MASK="omp;cuda" ./sycl_tests - name: run PSTL CUDA tests (integrated multipass) run: | cd ${GITHUB_WORKSPACE}/build/tests-cuda @@ -80,7 +87,7 @@ jobs: - name: run CPU tests (SSCP) run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp - ACPP_VISIBILITY_MASK=omp LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/build/install/lib ./sycl_tests -t '!group_functions_tests/*' -t '!extension_tests/*' -t '!kernel_invocation_tests/hierarchical*' + ACPP_VISIBILITY_MASK=omp LD_LIBRARY_PATH=${GITHUB_WORKSPACE}/build/install/lib ./sycl_tests - name: run PSTL CPU tests (SSCP) run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp @@ -95,6 +102,9 @@ jobs: cuda: ['11.0'] # Just to be able to build the backend for explicit multipass steps: - uses: actions/checkout@v4 + - name: clear JIT cache + run: | + rm -rf ~/.acpp - name: build run : | mkdir build && cd build @@ -129,6 +139,11 @@ jobs: echo "Running tests on AMD..." cd ${GITHUB_WORKSPACE}/build/tests-rocm-emp ACPP_VISIBILITY_MASK="omp;hip" ./sycl_tests + - name: run ROCm tests (SSCP) + run: | + echo "Running tests on AMD..." + cd ${GITHUB_WORKSPACE}/build/tests-sscp + ACPP_VISIBILITY_MASK="omp;hip" ./sycl_tests - name: run PSTL ROCm tests (SSCP) run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp @@ -142,25 +157,29 @@ jobs: clang_version: ['15'] steps: - uses: actions/checkout@v4 + - name: clear JIT cache + run: | + rm -rf ~/.acpp - name: build run : | mkdir build && cd build cmake -DCMAKE_CXX_COMPILER=/usr/bin/clang++-${{matrix.clang_version}} -DCLANG_EXECUTABLE_PATH=/usr/bin/clang++-${{matrix.clang_version}} -DLLVM_DIR=/usr/lib/llvm-${{matrix.clang_version}}/cmake -DWITH_LEVEL_ZERO_BACKEND=ON -DWITH_OPENCL_BACKEND=ON -DCMAKE_INSTALL_PREFIX=`pwd`/install .. 
make -j3 install - name: build generic SSCP tests - if: matrix.clang_version >= 14 run: | mkdir ${GITHUB_WORKSPACE}/build/tests-sscp cd ${GITHUB_WORKSPACE}/build/tests-sscp cmake -DACPP_TARGETS="generic" -DAdaptiveCpp_DIR=${GITHUB_WORKSPACE}/build/install/lib/cmake/AdaptiveCpp -DWITH_PSTL_TESTS=ON ${GITHUB_WORKSPACE}/tests - make pstl_tests -j3 + make -j3 + - name: run Intel tests (OpenCL) + run: | + cd ${GITHUB_WORKSPACE}/build/tests-sscp + LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/ ACPP_VISIBILITY_MASK="omp;ocl:Graphics.*" ./sycl_tests - name: run PSTL Intel tests (L0) - if: matrix.clang_version >= 14 run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp ACPP_VISIBILITY_MASK="omp;ze" ./pstl_tests - name: run PSTL Intel tests (OpenCL) - if: matrix.clang_version >= 14 run: | cd ${GITHUB_WORKSPACE}/build/tests-sscp LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/ ACPP_VISIBILITY_MASK="omp;ocl:Graphics.*" ./pstl_tests diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index f0da74ceb..edd18a837 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -8,7 +8,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - clang: [15, 16] + clang: [15, 16, 17, 18] os: [ubuntu-22.04] cuda: [11.0.2] rocm: [5.4.3] diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b45a621d..3a156319d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,11 +1,17 @@ -cmake_minimum_required (VERSION 3.9) -if(NOT CMAKE_VERSION VERSION_LESS 3.12) +cmake_minimum_required(VERSION 3.10) + +if(POLICY CMP0074) # Since CMake 3.12 cmake_policy(SET CMP0074 NEW) # Don't complain about using BOOST_ROOT... endif() +if(POLICY CMP0167) # Since CMake 3.30 + # Suppress warning: "Policy CMP0167 is not set: The FindBoost module is removed." + # Set CMP0167 to OLD to avoid breakage in the Windows & macOS CI tests. + cmake_policy(SET CMP0167 OLD) +endif() set(ACPP_VERSION_MAJOR 24) -set(ACPP_VERSION_MINOR 06) +set(ACPP_VERSION_MINOR 10) set(ACPP_VERSION_PATCH 0) execute_process( @@ -52,16 +58,16 @@ if(NOT ACPP_VERSION_SUFFIX) RESULT_VARIABLE GIT_LOCAL_CHANGES_RETURN_CODE OUTPUT_STRIP_TRAILING_WHITESPACE ) - - if (GIT_HASH_RETURN_CODE EQUAL 0 AND GIT_BRANCH_RETURN_CODE EQUAL 0 AND + + if (GIT_HASH_RETURN_CODE EQUAL 0 AND GIT_BRANCH_RETURN_CODE EQUAL 0 AND GIT_DATE_RETURN_CODE EQUAL 0 AND GIT_LOCAL_CHANGES_RETURN_CODE EQUAL 0) - + if(NOT "${ACPP_LOCAL_CHANGES}" STREQUAL "") set(DIRTY_STR ".dirty") else() set(DIRTY_STR "") endif() - + set(ACPP_VERSION_SUFFIX "+git.${ACPP_GIT_COMMIT_HASH}.${ACPP_GIT_DATE}.branch.${ACPP_GIT_BRANCH}${DIRTY_STR}") endif() endif() @@ -148,13 +154,13 @@ elseif(ACPP_COMPILER_FEATURE_PROFILE STREQUAL "custom-deprecated") if(DEFINED WITH_ACCELERATED_CPU) set(ACPP_CUSTOM_PROFILE_WITH_ACCELERATED_CPU ${WITH_ACCELERATED_CPU} CACHE INTERNAL "(deprecated) custom profile") endif() - + # Ensure that these argument do not enter cache, otherwise the if(defined) checks above # will no longer work. unset(WITH_SSCP_COMPILER CACHE) unset(WITH_STDPAR_COMPILER CACHE) unset(WITH_ACCELERATED_CPU CACHE) - + set(WITH_SSCP_COMPILER ${ACPP_CUSTOM_PROFILE_WITH_SSCP_COMPILER}) set(WITH_STDPAR_COMPILER ${ACPP_CUSTOM_PROFILE_WITH_STDPAR_COMPILER}) set(WITH_ACCELERATED_CPU ${ACPP_CUSTOM_PROFILE_WITH_ACCELERATED_CPU}) @@ -200,7 +206,7 @@ endif() if(WITH_ROCM_BACKEND) if(NOT ROCM_FOUND) # message(SEND_ERROR "hipcc was not found") - + # User has requested ROCm, but we could not find hipcc. 
# this is not necessarily a reason to abort, # since we only need libhip_hcc, the HIP includes, @@ -276,7 +282,7 @@ if(BUILD_CLANG_PLUGIN) if(LLVM_DIR_OLD AND NOT ("${LLVM_DIR_OLD}" STREQUAL "${LLVM_DIR}")) message(WARNING "Could not find LLVM in the requested location LLVM_DIR=${LLVM_DIR_OLD}; using ${LLVM_DIR}.") endif() - message(STATUS "Building hipSYCL against LLVM configured from ${LLVM_DIR}") + message(STATUS "Building AdaptiveCpp against LLVM configured from ${LLVM_DIR}") #find_package(Clang REQUIRED) find_program(CLANG_EXECUTABLE_PATH NAMES clang++-${LLVM_VERSION_MAJOR} clang++-${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR} clang++ HINTS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH CACHE STRING) @@ -298,7 +304,7 @@ if(BUILD_CLANG_PLUGIN) get_filename_component(LLVM_BIN_DIR "${CLANG_EXECUTABLE_PATH}" DIRECTORY) get_filename_component(LLVM_PREFIX_DIR "${LLVM_BIN_DIR}" DIRECTORY) # The path to the internal clang includes is currently required on ROCm - # to let acpp fix a wrong order of system includes (clang's internal + # to let acpp fix a wrong order of system includes (clang's internal # includes are not of high enough priority in the include path search order). # We identify this path as the one containing __clang_cuda_runtime_wrapper.h, # which is a clang-specific header file. @@ -318,9 +324,9 @@ if(BUILD_CLANG_PLUGIN) # Required for newer ROCm versions set(CLANG_INCLUDE_PATH ${FOUND_CLANG_INCLUDE_PATH}/..) endif() - + if(NOT EXISTS ${CLANG_INCLUDE_PATH}) - message(SEND_ERROR "clang include path ${CLANG_INCLUDE_PATH} does not exist. Please provide clang's internal include path manually: Find the directory where __clang_cuda_runtime_wrapper.h is. Provide this directory for older ROCm versions and the parent directory for newer ones.") + message(SEND_ERROR "CLANG_INCLUDE_PATH ${CLANG_INCLUDE_PATH} does not exist. Please provide clang's internal include path manually: Find the directory where __clang_cuda_runtime_wrapper.h is. Provide this directory for older ROCm versions and the parent directory for newer ones.") endif() if(WITH_ROCM_BACKEND) execute_process(COMMAND ${CLANG_EXECUTABLE_PATH} "--version" @@ -334,7 +340,7 @@ if(BUILD_CLANG_PLUGIN) message(STATUS "AMD clang version: ${ROCM_VERSION_MAJOR}.${ROCM_VERSION_MINOR}.${ROCM_VERSION_PATCH}") endif() endif() - + if(${LLVM_VERSION_MAJOR} LESS 14) if(${WITH_ACCELERATED_CPU} OR ${WITH_SSCP_COMPILER} OR ${WITH_STDPAR_COMPILER}) message(WARNING "clang version too old (${LLVM_VERSION_MAJOR} < 14) to be used with advanced AdaptiveCpp compiler features, disabling WITH_STDPAR_COMPILER, WITH_ACCELERATED_CPU, WITH_SSCP_COMPILER") @@ -345,8 +351,8 @@ if(BUILD_CLANG_PLUGIN) endif() endif() message(STATUS "Using clang include directory: ${CLANG_INCLUDE_PATH}") - -# Check if building on Windows and LLVM_LIBS is set, if not, use LLVM_AVAILABLE_LIBS + +# Check if building on Windows and LLVM_LIBS is set, if not, use LLVM_AVAILABLE_LIBS if(WIN32 AND NOT LLVM_LIBS AND LLVM_AVAILABLE_LIBS) llvm_map_components_to_libnames(LLVM_LIBS analysis core support passes) endif() @@ -362,7 +368,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(ACPP_CONFIG_FILE_PATH "${PROJECT_BINARY_DIR}") -set(ACPP_CONFIG_FILE_GLOBAL_INSTALLATION false CACHE BOOL +set(ACPP_CONFIG_FILE_GLOBAL_INSTALLATION false CACHE BOOL "Whether to install the AdaptiveCpp configuration files into a global directory (typically, /etc/AdaptiveCpp). 
This is generally not recommended.") if(ACPP_CONFIG_FILE_GLOBAL_INSTALLATION) @@ -386,7 +392,7 @@ endif() if(APPLE) set(DEFAULT_OMP_FLAG "-Xclang -fopenmp") - + if(Boost_FIBER_LIBRARY_DEBUG) set(DEFAULT_BOOST_LIBRARIES "${Boost_CONTEXT_LIBRARY_DEBUG} ${Boost_FIBER_LIBRARY_DEBUG} -Wl,-rpath ${Boost_LIBRARY_DIR}") else() @@ -451,7 +457,7 @@ if(WIN32) if(NOT OMP_LINK_LINE) set(OMP_LINK_LINE ${DEFAULT_WIN32_OMP_LINK_LINE} CACHE STRING "Arguments passed to compiler to link OpenMP libraries to SYCL applications") endif() - if(NOT SEQUENTIAL_LINK_LINE) + if(NOT SEQUENTIAL_LINK_LINE) set(SEQUENTIAL_LINK_LINE ${DEFAULT_WIN32_SEQUENTIAL_LINK_LINE} CACHE STRING "Arguments passed to compiler to link host libraries to SYCL applications") endif() elseif(APPLE) @@ -461,7 +467,7 @@ elseif(APPLE) if(NOT OMP_LINK_LINE) set(OMP_LINK_LINE ${DEFAULT_APPLE_OMP_LINK_LINE} CACHE STRING "Arguments passed to compiler to link OpenMP libraries to SYCL applications") endif() - if(NOT SEQUENTIAL_LINK_LINE) + if(NOT SEQUENTIAL_LINK_LINE) set(SEQUENTIAL_LINK_LINE ${DEFAULT_APPLE_SEQUENTIAL_LINK_LINE} CACHE STRING "Arguments passed to compiler to link host libraries to SYCL applications") endif() else() @@ -471,33 +477,33 @@ else() if(NOT OMP_LINK_LINE) set(OMP_LINK_LINE ${DEFAULT_OMP_LINK_LINE} CACHE STRING "Arguments passed to compiler to link OpenMP libraries to SYCL applications") endif() - if(NOT SEQUENTIAL_LINK_LINE) + if(NOT SEQUENTIAL_LINK_LINE) set(SEQUENTIAL_LINK_LINE ${DEFAULT_SEQUENTIAL_LINK_LINE} CACHE STRING "Arguments passed to compiler to link host libraries to SYCL applications") endif() endif() # If no compile flags given, set to default. if(NOT ROCM_CXX_FLAGS) - # clang erroneously sets feature detection flags for + # clang erroneously sets feature detection flags for # __float128 even though it is not supported for CUDA / HIP, # see https://bugs.llvm.org/show_bug.cgi?id=47559. set(ROCM_CXX_FLAGS "-isystem $HIPSYCL_PATH/include/AdaptiveCpp/hipSYCL/std/hiplike -isystem ${CLANG_INCLUDE_PATH} -U__FLOAT128__ -U__SIZEOF_FLOAT128__ -I$HIPSYCL_ROCM_PATH/include -I$HIPSYCL_ROCM_PATH/include --rocm-device-lib-path=$HIPSYCL_ROCM_PATH/amdgcn/bitcode --rocm-path=$HIPSYCL_ROCM_PATH -fhip-new-launch-api -mllvm -amdgpu-early-inline-all=true -mllvm -amdgpu-function-calls=false -D__HIP_ROCclr__" CACHE STRING "Arguments passed to compiler to compile SYCL applications with ROCm") endif() -if(NOT CUDA_CXX_FLAGS) - # clang erroneously sets feature detection flags for +if(NOT CUDA_CXX_FLAGS) + # clang erroneously sets feature detection flags for # __float128 even though it is not supported for CUDA / HIP, # see https://bugs.llvm.org/show_bug.cgi?id=47559. 
set(CUDA_CXX_FLAGS "-U__FLOAT128__ -U__SIZEOF_FLOAT128__ -isystem $HIPSYCL_PATH/include/AdaptiveCpp/hipSYCL/std/hiplike" CACHE STRING "Arguments passed to compiler to compile SYCL applications with CUDA") endif() # always need -D_ENABLE_EXTENDED_ALIGNED_STORAGE to allow correctly aligned local memory on CPU -if(NOT OMP_CXX_FLAGS) +if(NOT OMP_CXX_FLAGS) set(OMP_CXX_FLAGS "-I${Boost_INCLUDE_DIR} ${DEFAULT_OMP_FLAG} -D_ENABLE_EXTENDED_ALIGNED_STORAGE" CACHE STRING "Arguments passed to compiler to compile SYCL applications with OpenMP") endif() -if(NOT SEQUENTIAL_CXX_FLAGS) +if(NOT SEQUENTIAL_CXX_FLAGS) set(SEQUENTIAL_CXX_FLAGS "-I${Boost_INCLUDE_DIR} -D_ENABLE_EXTENDED_ALIGNED_STORAGE" CACHE STRING "Arguments passed to compiler to compile SYCL applications on host") endif() @@ -511,13 +517,13 @@ set(DEFAULT_GPU_ARCH "" CACHE STRING "(Deprecated, use DEFAULT_TARGETS instead) set(DEFAULT_TARGETS "" CACHE STRING "Default targets to compile for") if(NOT DEFAULT_TARGETS) - if(DEFAULT_PLATFORM) + if(DEFAULT_PLATFORM) message(DEPRECATION "DEFAULT_PLATFORM is deprecated; use DEFAULT_TARGETS instead.") - + if(DEFAULT_PLATFORM STREQUAL "cpu") set(DEFAULT_TARGETS "omp") endif() - + if(DEFAULT_GPU_ARCH) message(DEPRECATION "DEFAULT_GPU_ARCH is deprecated; use DEFAULT_TARGETS instead.") @@ -528,7 +534,7 @@ if(NOT DEFAULT_TARGETS) else() message(SEND_ERROR "Invalid value for DEFAULT_PLATFORM: \"${DEFAULT_PLATFORM}\". When DEFAULT_GPU_ARCH is specified, only \"cuda\" and \"rocm\" are supported.") endif() - + endif() elseif(DEFAULT_GPU_ARCH) message(DEPRECATION "DEFAULT_GPU_ARCH is deprecated; use DEFAULT_TARGETS instead.") @@ -607,11 +613,19 @@ configure_file( install(DIRECTORY include/CL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.hpp") install(DIRECTORY include/SYCL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.hpp") + install(DIRECTORY include/hipSYCL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.hpp") install(DIRECTORY include/hipSYCL DESTINATION include/AdaptiveCpp/ FILES_MATCHING PATTERN "*.h") install(DIRECTORY include/hipSYCL/std DESTINATION include/AdaptiveCpp/hipSYCL/ ) install(FILES ${PROJECT_BINARY_DIR}/include/hipSYCL/common/config.hpp DESTINATION include/AdaptiveCpp/hipSYCL/common/) +# This part of the installation process can be simplified once the source directory has been +# renamed from hipSYCL to AdaptiveCpp. +install(DIRECTORY include/hipSYCL/ DESTINATION include/AdaptiveCpp/AdaptiveCpp FILES_MATCHING PATTERN "*.hpp") +install(DIRECTORY include/hipSYCL/ DESTINATION include/AdaptiveCpp/AdaptiveCpp FILES_MATCHING PATTERN "*.h") +install(DIRECTORY include/hipSYCL/std/ DESTINATION include/AdaptiveCpp/AdaptiveCpp/std ) +install(FILES ${PROJECT_BINARY_DIR}/include/hipSYCL/common/config.hpp DESTINATION include/AdaptiveCpp/AdaptiveCpp/common/) + if(NOT WIN32) # Windows is case-insensitive, so don't copy to sycl/sycl.hpp as # we already have SYCL/sycl.hpp diff --git a/README.md b/README.md index 9b2168c6a..e7c45c43d 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ In order to compile software with AdaptiveCpp, use `acpp`. `acpp` can be used li `acpp` accepts both command line arguments and environment variables to configure its behavior (e.g., to select the target to compile for). See `acpp --help` for a comprehensive list of options. -For details and instructions on using AdaptiveCpp in CMake projects, please see the documentation on [using AdaptiveCpp](doc/using-hipsycl.md). 
+For details and instructions on using AdaptiveCpp in CMake projects, please see the documentation on [using AdaptiveCpp](doc/using-acpp.md). ## About the project @@ -77,6 +77,7 @@ We gratefully acknowledge [contributions](https://github.com/illuhad/hipSYCL/gra * AdaptiveCpp [design and architecture](doc/architecture.md) * AdaptiveCpp runtime [specification](doc/runtime-spec.md) * AdaptiveCpp [compilation model](doc/compilation.md) +* AdaptiveCpp [parallel algorithms library](doc/algorithms.md) * How to use raw HIP/CUDA inside AdaptiveCpp code to create [optimized code paths](doc/hip-source-interop.md) * A simple SYCL example code for testing purposes can be found [here](doc/examples.md). * [SYCL Extensions implemented in AdaptiveCpp](doc/extensions.md) diff --git a/bin/acpp b/bin/acpp index 236834883..f99888506 100755 --- a/bin/acpp +++ b/bin/acpp @@ -44,7 +44,7 @@ class hcf_node: @property def subnodes(self): return self._subnodes - + def make_subnode(self, name): n = hcf_node(name, self._nesting_level+1) self._subnodes.append(n) @@ -58,7 +58,7 @@ class hcf_node: @property def values(self): return self._key_value_pairs - + @property def name(self): return self._node_name @@ -69,12 +69,12 @@ class hcf_node: for k in self._key_value_pairs: result += "{}{}={}\n".format(indent,k, self._key_value_pairs[k]) - + for n in self._subnodes: result += indent + "{." + n.name + "\n" result += str(n) result += indent + "}." + n.name + "\n" - + return result class hcf_generator: @@ -102,7 +102,7 @@ class hcf_generator: # Return non-binary readable part def __str__(self) -> str: return str(self._root) + "__acpp_hcf_binary_appendix" - + @property def bytes(self): result = str(self).encode("utf-8") @@ -113,7 +113,7 @@ class hcf_generator: @property def escaped_bytes(self): hex = binascii.hexlify(self.bytes).decode("utf-8") - return ",".join(["0x" + hex[i:i+2] + return ",".join(["0x" + hex[i:i+2] for i in range(0,len(hex),2)]) @@ -124,14 +124,14 @@ class integration_header: self._hcf.root.values["object-id"] = self._object_id self._hcf.root.values["generator"] = "syclcc" self._backend = backend_name - + @property def hcf_object(self): return self._hcf - + def __str__(self) -> str: hcf_string = self._hcf.escaped_bytes - + header = """ #ifndef ACPP_{capital_name}_INTEGRATION_HEADER #define ACPP_{capital_name}_INTEGRATION_HEADER @@ -142,12 +142,12 @@ ACPP_STATIC_HCF_REGISTRATION({hcf_object_id}ull, __acpp_hcf_object_{hcf_object_i #endif """.format( - capital_name = self._backend.upper(), + capital_name = self._backend.upper(), name = self._backend.lower(), hcf_object_id = self._object_id, hcf_size = len(self._hcf.bytes), hcf_binary = hcf_string) - + return header def write_header(self, filename): @@ -157,7 +157,7 @@ ACPP_STATIC_HCF_REGISTRATION({hcf_object_id}ull, __acpp_hcf_object_{hcf_object_i class config_db: # Scans the provided directory for json files def __init__(self, config_file_dirs): - + self._data = {} self._locations = {} self._config_dirs = config_file_dirs @@ -236,15 +236,15 @@ class acpp_config: # 3.) the field in the config file. self._options = { 'platform': option("--acpp-platform", "ACPP_PLATFORM", "default-platform", -""" (deprecated) The platform that hipSYCL should target. Valid values: +""" (deprecated) The platform that AdaptiveCpp should target. 
Valid values:
 * cuda: Target NVIDIA CUDA GPUs
 * rocm: Target AMD GPUs running on the ROCm platform
 * cpu: Target only CPUs"""),
 'clang': option("--acpp-clang", "ACPP_CLANG", "default-clang",
""" The path to the clang executable that should be used for compilation
- (Note: *must* be compatible with the clang version that the
- hipSYCL clang plugin was compiled against!)"""),
+ (Note: *must* be compatible with the clang version that the
+ AdaptiveCpp clang plugin was compiled against!)"""),
 'nvcxx': option("--acpp-nvcxx", "ACPP_NVCXX", "default-nvcxx",
""" The path to the nvc++ executable that should be used for compilation
@@ -263,7 +263,7 @@ class acpp_config:
 'cpu-compiler': option("--acpp-cpu-cxx", "ACPP_CPU_CXX", "default-cpu-cxx",
""" The compiler that should be used when targeting only CPUs."""),
-
+
 'clang-include-path' : option("--acpp-clang-include-path", "ACPP_CLANG_INCLUDE_PATH", "default-clang-include-path",
""" The path to clang's internal include headers. Typically of the form $PREFIX/include/clang//include. Only required by ROCm."""),
@@ -294,7 +294,7 @@ class acpp_config:
 'config-file-dir' : option("--acpp-config-file-dir", "ACPP_CONFIG_FILE_DIR", "default-config-file-dir",
""" Select an alternative path for the config files containing the default AdaptiveCpp settings.
 It is normally not necessary for the user to change this setting. """),
-
+
 'targets': option("--acpp-targets", "ACPP_TARGETS", "default-targets",
""" Specify backends and targets to compile for. Example: --acpp-targets='omp;hip:gfx900,gfx906'
 Available backends:
@@ -304,11 +304,11 @@ class acpp_config:
 Uses Boost.Fiber for nd_range parallel_for support.
 - omp.accelerated: Uses clang as host compiler to enable compiler support for nd_range parallel_for (see --acpp-use-accelerated-cpu).
- * cuda - CUDA backend
+ * cuda - CUDA backend
 Requires specification of targets of the form sm_XY, e.g. sm_70 for Volta, sm_60 for Pascal
 Backend Flavors:
- cuda.explicit-multipass: CUDA backend in explicit multipass mode
+ - cuda.explicit-multipass: CUDA backend in explicit multipass mode
 (see --acpp-explicit-multipass)
 - cuda.integrated-multipass: Force CUDA backend to operate in integrated multipass mode.
@@ -324,7 +324,7 @@ class acpp_config:
 multipass mode.
 * generic - use generic LLVM SSCP compilation flow, and JIT at runtime to target device"""),
- 'stdpar-prefetch-mode' : option("--acpp-stdpar-prefetch-mode", "ACPP_STDPAR_PREFETCH_MODE", "default-stdpar-prefetch-mode",
+ 'stdpar-prefetch-mode' : option("--acpp-stdpar-prefetch-mode", "ACPP_STDPAR_PREFETCH_MODE", "default-stdpar-prefetch-mode",
""" AdaptiveCpp supports issuing automatic USM prefetch operations for allocations used inside offloaded
 C++ PSTL algorithms. This flag determines the strategy for submitting such prefetches. Supported values are:
@@ -344,8 +344,11 @@ class acpp_config:
 of a work-group in a single thread, eliminating scheduling overhead and enabling enhanced vectorization
 opportunities compared to the fiber variant."""),
 'is-dryrun': option("--acpp-dryrun", "ACPP_DRYRUN", "default-is-dryrun",
-""" If set, only shows compilation commands that would be executed,
+""" If set, only shows compilation commands that would be executed,
 but does not actually execute them. """),
+ 'is-dryrun-only-std-flags': option("--acpp-dryrun-only-std-flags", "ACPP_DRYRUN_ONLYSTDFLAGS", "default-is-dryrun-only-std-flags",
+""" If set, only shows compilation commands that would be executed,
+ but does not actually execute them. This version also removes all non-standard flags."""),
 'is-explicit-multipass': option("--acpp-explicit-multipass", "ACPP_EXPLICIT_MULTIPASS", "default-is-explicit-multipass",
""" If set, executes device passes as separate compiler invocations
 and lets AdaptiveCpp control embedding device
@@ -354,16 +357,19 @@ class acpp_config:
 For example, you cannot use the CUDA kernel launch syntax [i.e. kernel <<< ... >>> (...)] in this mode. """),
 'should-save-temps': option("--acpp-save-temps", "ACPP_SAVE_TEMPS", "default-save-temps",
""" If set, do not delete temporary files created during compilation."""),
- 'stdpar' : option("--acpp-stdpar", "ACPP_STDPAR", "default-is-stdpar",
+ 'stdpar' : option("--acpp-stdpar", "ACPP_STDPAR", "default-is-stdpar",
""" If set, enables SYCL offloading of C++ standard parallel algorithms."""),
- 'stdpar-system-usm' : option("--acpp-stdpar-system-usm", "ACPP_STDPAR_SYSTEM_USM", "default-is-stdpar-system-usm",
+ 'stdpar-system-usm' : option("--acpp-stdpar-system-usm", "ACPP_STDPAR_SYSTEM_USM", "default-is-stdpar-system-usm",
""" If set, assume availability of system-level unified shared memory where every pointer from regular
 malloc() is accessible on GPU. This disables automatic hijacking of memory allocations
 at the compiler level by AdaptiveCpp."""),
- 'stdpar-unconditional-offload' : option("--acpp-stdpar-unconditional-offload", "ACPP_STDPAR_UNCONDITIONAL_OFFLOAD", "default-is-stdpar-unconditional-offload",
+ 'stdpar-unconditional-offload' : option("--acpp-stdpar-unconditional-offload", "ACPP_STDPAR_UNCONDITIONAL_OFFLOAD", "default-is-stdpar-unconditional-offload",
""" Normally, heuristics are employed to determine whether algorithms should be offloaded.
 This particularly affects small problem sizes. If this flag is set, supported parallel STL
- algorithms will be offloaded unconditionally.""")
+ algorithms will be offloaded unconditionally."""),
+ 'is-export-all' : option("--acpp-export-all", "ACPP_EXPORT_ALL", "default-export-all",
+""" (Experimental) Treat all functions implicitly as SYCL_EXTERNAL. Only supported with generic target.
+ This currently only works with translation units that include the sycl.hpp header.""") } @@ -375,7 +381,7 @@ class acpp_config: self._targets = None self._cxx_path = None self._clang_path = None - + for arg in self._args: if self._is_acpp_arg(arg): self._acpp_args.append(arg) @@ -383,22 +389,22 @@ class acpp_config: self._acpp_args.append(self._upgrade_legacy_arg(arg)) else: self._forwarded_args.append(arg) - + for envvar in os.environ: if self._is_acpp_envvar(envvar): self._acpp_environment_args[envvar] = os.environ[envvar] elif self._is_acpp_envvar(self._upgrade_legacy_environ_var(envvar)): self._acpp_environment_args[self._upgrade_legacy_environ_var(envvar)] = os.environ[envvar] - + config_file_directories = [] install_config_dir = os.path.abspath( os.path.join(self.acpp_installation_path, "etc/AdaptiveCpp")) - + # TODO try using some more portable path here global_config_dir = '/etc/AdaptiveCpp' - + if self._is_option_set_to_non_default_value("config-file-dir"): config_file_directories.append(self._retrieve_option("config-file-dir")) elif os.path.exists(install_config_dir): @@ -407,7 +413,7 @@ class acpp_config: config_file_directories.append(global_config_dir) self._config_db = config_db(config_file_directories) - + self._common_compiler_args = self._get_std_compiler_args() @@ -428,7 +434,7 @@ class acpp_config: if arg.startswith(accepted_arg + "=") or arg == accepted_arg: return True return False - + def _is_acpp_envvar(self, varname): accepted_vars = [self._options[opt].environment for opt in self._options] accepted_vars += [self._flags[flag].environment for flag in self._flags] @@ -481,7 +487,7 @@ class acpp_config: for arg in self._acpp_args: if arg == flag.commandline: return True - + if arg.startswith(flag.commandline + "="): return self._interpret_flag(arg.split("=")[1]) @@ -537,7 +543,7 @@ class acpp_config: def _substitute_rocm_template_string(self, template_string): return self._substitute_template_string( template_string, self._get_rocm_substitution_vars()) - + def _substitute_cuda_template_string(self, template_string): return self._substitute_template_string( template_string, self._get_cuda_substitution_vars()) @@ -573,7 +579,7 @@ class acpp_config: # Try config db if self._config_db.contains_key(opt.config_db): return self._config_db.get(opt.config_db) - + if not allow_unset: raise OptionNotSet("Required command line argument {} or environment variable {} not specified".format( opt.commandline, opt.environment)) @@ -600,13 +606,13 @@ class acpp_config: def _parse_targets(self, target_arg): # Split backends by ; platform_substrings = target_arg.replace("'","").replace('"',"").split(';') - + result = {} for p in platform_substrings: platform_target_separated = p.split(':', 1) if len(platform_target_separated) > 2 or len(platform_target_separated) == 0: raise RuntimeError("Invalid target description: " + p) - + platform = platform_target_separated[0].strip().lower() if not platform in result: @@ -619,7 +625,7 @@ class acpp_config: result[platform].append(t) return result - + def _get_executable_path(self, path): normalized_path = shutil.which(path) if normalized_path: @@ -628,21 +634,21 @@ class acpp_config: @property def version(self): - + if not self._config_db.contains_key("version-major"): raise OptionNotSet("Could not retrieve major version from config file") if not self._config_db.contains_key("version-minor"): raise OptionNotSet("Could not retrieve major version from config file") if not self._config_db.contains_key("version-patch"): raise OptionNotSet("Could not 
retrieve major version from config file")
-
+
 # version suffix may be empty if git queries fail
 suffix = ""
 if self._config_db.contains_key("version-suffix"):
 suffix = self._config_db.get("version-suffix")
 return (
- self._config_db.get("version-major"),
+ self._config_db.get("version-major"),
 self._config_db.get("version-minor"),
 self._config_db.get("version-patch"),
 suffix)
@@ -671,7 +677,7 @@ class acpp_config:
 @property
 def targets(self):
-
+
 if self._targets == None:
 raw_target_string = ""
 try:
@@ -689,11 +695,11 @@ class acpp_config:
 if platform in hip_platform_synonyms:
 target_arch = self._retrieve_option("gpu-arch")
 raw_target_string = "hip:" + target_arch
-
+
 elif platform in cuda_platform_synonyms:
 target_arch = self._retrieve_option("gpu-arch")
 raw_target_string = "cuda:" + target_arch
-
+
 elif platform in pure_cpu_platform_synonyms:
 raw_target_string = "omp"
 except OptionNotSet:
@@ -742,8 +748,6 @@ class acpp_config:
 def acpp_plugin_path(self):
 if sys.platform.startswith('win32'):
 return os.path.join(self.acpp_installation_path, "bin", "acpp-clang.dll")
- elif sys.platform == "darwin":
- return os.path.join(self.acpp_installation_path, "lib", "libacpp-clang.dylib")
 else:
 return os.path.join(self.acpp_installation_path, "lib", "libacpp-clang.so")
@@ -798,13 +802,20 @@ class acpp_config:
 except OptionNotSet:
 return False
+ @property
+ def is_dryrun_only_std_flags(self):
+ try:
+ return self._is_flag_set("is-dryrun-only-std-flags")
+ except OptionNotSet:
+ return False
+
 @property
 def use_accelerated_cpu(self):
 try:
 return self._is_flag_set("use-accelerated-cpu")
 except OptionNotSet:
 return False
-
+
 @property
 def is_explicit_multipass(self):
 try:
@@ -812,6 +823,13 @@ class acpp_config:
 except OptionNotSet:
 return False
+ @property
+ def is_export_all(self):
+ try:
+ return self._is_flag_set("is-export-all")
+ except OptionNotSet:
+ return False
+
 @property
 def is_stdpar(self):
 try:
@@ -832,7 +850,7 @@ class acpp_config:
 return self._is_flag_set("stdpar-unconditional-offload")
 except OptionNotSet:
 return False
-
+
 @property
 def stdpar_prefetch_mode(self):
 return self._retrieve_option("stdpar-prefetch-mode")
@@ -887,7 +905,7 @@ class acpp_config:
 if ending.isnumeric() or ending in ["s", "fast", "g"]:
 return True
 return False
-
+
 def contains_linking_stage(self):
 for arg in self.forwarded_compiler_arguments:
 if (arg == "-E" or
@@ -915,10 +933,53 @@ class acpp_config:
 def is_pure_linking_stage(self):
 return len(self.source_file_arguments) == 0
-def run_or_print(command, print_only):
+def filter_cmd_args(command, verbose=False):
+    new_cmd = []
+
+    # Flags that carry their value in the same token, e.g. -I<path>, -DFOO=1 or -std=c++17.
+    whitelist = [
+        "-I", "-D", "-W", "-std=", "-pedantic-errors"
+    ]
+
+    # Flags whose value follows as a separate token; keep the flag and the
+    # token after it. A bare "-I" is matched here (separate form "-I <path>"),
+    # while the attached form "-I<path>" is caught by the whitelist above.
+    whitelist_enable_next = [
+        "-isystem", "-o", "-c", "-I"
+    ]
+
+    add_next_arg = True # always keep the first element, the compiler executable
+    for arg in command:
+        if add_next_arg:
+            new_cmd.append(arg)
+            add_next_arg = False
+        elif arg in whitelist_enable_next:
+            new_cmd.append(arg)
+            add_next_arg = True
+        elif any(arg.startswith(w) for w in whitelist):
+            new_cmd.append(arg)
+
+    if verbose:
+        for arg in command:
+            if not arg in new_cmd:
+                print("removed:", arg)
+
+    return new_cmd
+
+
+def run_or_print(command, print_only, only_std_flags=False):
+
 if not print_only:
 return subprocess.call(command)
 else:
+        if only_std_flags:
+            command = filter_cmd_args(command, verbose=False)
 print(' '.join(command))
return 0 @@ -976,12 +1037,12 @@ class cuda_multipass_invocation: 'Unnamed kernel lambdas are unsupported in this configuration because the selected host compiler ' +self._host_compiler+' does not match the device compiler of the backend '+self.get_device_compiler() ] - + return { 'requires-extended-host-pass' : requires_extended_host_pass, 'extended-host-pass-providers' : [ - 'cuda.explicit-multipass', 'hip.explicit-multipass', + 'cuda.explicit-multipass', 'hip.explicit-multipass', 'cuda.integrated-multipass', 'hip.integrated-multipass'], 'conflicts' : [], 'caveats' : caveats @@ -1046,7 +1107,7 @@ class cuda_multipass_invocation: for target,ptx in zip(targets, ptx_content): target_node = header.hcf_object.root.make_subnode(target) header.hcf_object.attach_text_content(target_node, ptx) - + header.write_header(self._integration_header) class hip_multipass_invocation: @@ -1103,12 +1164,12 @@ class hip_multipass_invocation: 'Unnamed kernel lambdas are unsupported in this configuration because the selected host compiler ' +self._host_compiler+' does not match the device compiler of the backend '+self.get_device_compiler() ] - + return { 'requires-extended-host-pass' : requires_extended_host_pass, 'extended-host-pass-providers' : [ - 'cuda.explicit-multipass', 'hip.explicit-multipass', + 'cuda.explicit-multipass', 'hip.explicit-multipass', 'cuda.integrated-multipass', 'hip.integrated-multipass'], 'conflicts' : [], 'caveats' : caveats @@ -1171,7 +1232,7 @@ class hip_multipass_invocation: for target,hipfb in zip(targets, hipfb_content): target_node = header.hcf_object.root.make_subnode(target) header.hcf_object.attach_binary_content(target_node, hipfb) - + header.write_header(self._integration_header) @@ -1228,7 +1289,7 @@ class cuda_invocation: if not sys.platform.startswith("win32"): flags += ["-fpass-plugin=" + self._acpp_plugin_path] - + return flags def get_linker_flags(self): @@ -1282,7 +1343,7 @@ class cuda_nvcxx_invocation: except OptionNotSet: # nvc++ can handle not setting targets explicitly pass - + return flags def get_linker_flags(self): @@ -1330,13 +1391,13 @@ class hip_invocation: "-fplugin=" + self._acpp_plugin_path, "-D__ACPP_CLANG__" ] - + for t in self._hip_targets: flags += ["--cuda-gpu-arch=" + t] if not sys.platform.startswith("win32"): flags += ["-fpass-plugin=" + self._acpp_plugin_path] - + return flags def get_linker_flags(self): @@ -1379,7 +1440,7 @@ class omp_invocation: def get_cxx_flags(self): flags = ["-D__ACPP_ENABLE_OMPHOST_TARGET__"] flags += self._cxx_flags - + return flags def get_linker_flags(self): @@ -1522,8 +1583,11 @@ class llvm_sscp_invocation: flags = ["-D__ACPP_ENABLE_LLVM_SSCP_TARGET__", "-Xclang", "-disable-O0-optnone", "-mllvm", "-acpp-sscp"] + if self._config.is_export_all: + flags += ["-mllvm","-acpp-sscp-export-all"] + sscp_compile_opts = [] - if ("-Ofast" in self._config.forwarded_compiler_arguments or + if ("-Ofast" in self._config.forwarded_compiler_arguments or "-ffast-math" in self._config.forwarded_compiler_arguments): sscp_compile_opts.append("fast-math") @@ -1549,7 +1613,8 @@ class compiler: self._user_args = config.forwarded_compiler_arguments self._requires_linking = config.contains_linking_stage() self._requires_compilation = not config.is_pure_linking_stage() - self._is_dry_run = config.is_dryrun + self._is_dry_run = config.is_dryrun or config.is_dryrun_only_std_flags + self._only_std_flags = config.is_dryrun_only_std_flags self._targets = config.targets self._common_compiler_args = config.common_compiler_args self._acpp_path = 
config.acpp_installation_path @@ -1634,7 +1699,7 @@ class compiler: raise RuntimeError("Unknown backend: " + backend) self._backends += self._multipass_backends - + self._host_compiler = self._select_compiler() for mb in self._multipass_backends: mb.set_host_compiler(self._host_compiler) @@ -1647,7 +1712,7 @@ class compiler: self._verify_backend_combinations() - # Take into account extended host pass requirements for + # Take into account extended host pass requirements for # explicit multipass. E.g., CUDA explicit multipass requires # -x cuda or -x hip in the host pass. # The "extended host pass" concept is an abstraction of the fact @@ -1671,7 +1736,7 @@ class compiler: print_error("backend",b, "appears multiple times in processed target specification") reqs = b.get_host_pass_requirements() - + conflicts = reqs['conflicts'] caveats = reqs['caveats'] for c in caveats: @@ -1681,7 +1746,7 @@ class compiler: if c in selected_backends: print_error("requested backends",b.unique_name, "and",c,"are incompatible.") fatal_error = True - + if fatal_error: raise RuntimeError("Errors encountered while verifying combination of requested backends.") @@ -1695,12 +1760,12 @@ class compiler: if host_pass_reqs['requires-extended-host-pass']: extended_pass_providers = host_pass_reqs['extended-host-pass-providers'] - + available_providers = [] for provider in extended_pass_providers: if provider in active_backends: available_providers.append(provider) - + # If there is already an integrated multipass backend running # that already provides the flags, or if an explicit multipass # provider is already enabled, there is nothing to do @@ -1709,7 +1774,7 @@ class compiler: return elif active_backends[p].is_extended_host_pass_enabled: return - + # Otherwise, we need to select and enable an explicit multipass # provider. Currently we always select the backend we are configuring. 
# TODO make this user configurable, especially if we add HIP explicit multipass @@ -1758,12 +1823,12 @@ class compiler: "-mllvm", "-acpp-stdpar", "-include", os.path.join(stdpar_include_path, "detail", "sycl_glue.hpp") ] - + if self._is_stdpar_system_usm: args += ["-mllvm", "-acpp-stdpar-no-malloc-to-usm", "-D__ACPP_STDPAR_ASSUME_SYSTEM_USM__"] if self._is_stdpar_unconditional_offload: args += ["-D__ACPP_STDPAR_UNCONDITIONAL_OFFLOAD__"] - + if self._stdpar_prefetch_mode != None: prefetch_mode_string = self._stdpar_prefetch_mode prefetch_mode_id = 0 @@ -1780,7 +1845,7 @@ class compiler: prefetch_mode_id = 4 else: raise RuntimeError("Invalid value for stdpar-prefetch-mode: "+prefetch_mode_string) - + args += ["-D__ACPP_STDPAR_PREFETCH_MODE__="+str(prefetch_mode_id)] return args + self._common_compiler_args @@ -1791,7 +1856,7 @@ class compiler: "-L"+self._acpp_lib_path, "-lacpp-rt" ] - + if sys.platform == "darwin": linker_args.append("-Wl,-rpath") linker_args.append(self._acpp_lib_path) @@ -1831,7 +1896,7 @@ class compiler: if priority > compiler_priority: compiler_executable = cxx compiler_priority = priority - + return compiler_executable def _flag_should_be_unique(self, flag): @@ -1883,7 +1948,7 @@ class compiler: args += ld_flags return run_or_print([compiler_executable] + args, - self._is_dry_run) + self._is_dry_run, self._only_std_flags) def run(self): temp_prefix = "adaptivecpp-" @@ -1929,12 +1994,14 @@ def print_usage(config): print("--help\n Print this help message\n") print("\nAny other options will be forwarded to the compiler.") print("\nNote: Command line arguments take precedence over environment variables.") + print("\n\nFor guidance on how to get good performance with AdaptiveCpp, please see") + print("\nhttps://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/performance.md") if __name__ == '__main__': if sys.version_info[0] < 3: print_error("acpp requires python 3.") sys.exit(-1) - + filename = os.path.basename(os.path.realpath(__file__)) if filename == "syclcc": print_warning("syclcc is deprecated; please use acpp instead.") @@ -1968,7 +2035,7 @@ if __name__ == '__main__': print_warning("No optimization flag was given, optimizations are " "disabled by default. Performance may be degraded. Compile with e.g. -O2/-O3 to " "enable optimizations.") - + c = compiler(config) sys.exit(c.run()) except Exception as e: diff --git a/devops/repos/README.md b/devops/repos/README.md deleted file mode 100644 index 8c4bab79f..000000000 --- a/devops/repos/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# hipSYCL packaging system - -Currently the packaging is based around three groups of bash scripts bound together by the `update-repos.sh`, and the `common/init.sh scripts`. We aimed for having most of these scripts available for use separately from the packaging system, and to serve as inspiration. - -The three logical groups are installation, package creation, repository creation, and testing. installation and package creation scripts are located in the `install/scripts` directory, repo creation and testing scripts are located in the `devops/repos directory`. - -We provide a high level overview of the different functions here please refer to the actual scripts for more detail - -## update_repo.sh - -This script serves as a wrapper around the different other scripts that are responsible for building packaging and testing. It is usefulness lies in creating a access point for all the functions that are scattered among the different directories. 
- -## record_env_vars.sh - -Creates the `~/envs.out` file, based on the current environment. - -## create_pkgs.sh - -Executes the packaging script for a distro. and moves the finished packages to the staging folder. It has two modes, `hipsycl` and `base` the former only builds the hipSYCL packages later only builds the base packages - -## create_repos.sh - -Executes the repo creation for a distribution. - -## test-packages.sh - -Handles testing of the built and deployed packages for a certain backend configuration. - -## test-installation.sh - -Run tests on a singularity container containing hipSYCLs - -## Examples - -``` -bash update_repo.sh centos-7 build_base build # Build base container -bash update_repo.sh centos-7 build_base spack-install/rocm # Install rocm into base container -bash update_repo.sh centos-7 build_base spack-install/llvm # Install llvm -bash update_repo.sh centos-7 package base # create base packages for rocm and llvm&boost -bash update_repo.sh centos-7 package hipsycl # create hipsycl packages -bash update_repo.sh centos-7 deploy # deploy packages -bash update_repo.sh centos-7 test 00 # run build, add_repo install_dep run_test for the test -bash update_repo.sh centos-7 test 00 build # build testing container -bash update_repo.sh centos-7 test 00 add_repo # Add hipSYCL repo to testing container -bash update_repo.sh centos-7 pub_cont # Publish containers -``` - - - diff --git a/devops/repos/common/init.sh b/devops/repos/common/init.sh deleted file mode 100644 index 651043850..000000000 --- a/devops/repos/common/init.sh +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -HIPSYCL_PKG_REPO_STAGE_DIR=${HIPSYCL_PKG_REPO_STAGE_DIR:-./stage} -export HIPSYCL_GPG_KEY=E967BA09716F870320089583E68CC4B9B2B75080 - -#Testing -declare -A install_cmd=( ["archlinux-rolling"]="pacman -Sy --noconfirm hipSYCL" \ - ["centos-7"]="yum -y install hipSYCL" \ - ["centos-8"]="yum -y install hipSYCL" \ - ["ubuntu-18.04"]="apt -y install hipsycl" \ - ["ubuntu-20.04"]="apt -y install hipsycl" - ) - -declare -A cleanup_cmd=( ["archlinux-rolling"]="pacman -Rsn --noconfirm hipSYCL" \ - ["centos-7"]="yum -y remove hipSYCL" \ - ["centos-8"]="yum -y remove hipSYCL" \ - ["ubuntu-18.04"]="apt -y remove hipsycl" \ - ["ubuntu-20.04"]="apt -y remove hipsycl" - ) - -declare -A cleanup_dep=( ["archlinux-rolling"]='pacman -Rsn --noconfirm $(pacman -Qdtq)' \ - ["centos-7"]="package-cleanup -y --leaves" \ - ["centos-8"]="package-cleanup -y --leaves" \ - ["ubuntu-18.04"]="apt -y autoremove" \ - ["ubuntu-20.04"]="apt -y autoremove" - ) - - -declare -A image_base=( ["archlinux-rolling"]="docker://archlinux:base" \ - ["centos-7"]="docker://centos:centos7" \ - ["centos-8"]="docker://centos:centos8" \ - ["ubuntu-18.04"]="docker://ubuntu:18.04" \ - ["ubuntu-20.04"]="docker://ubuntu:20.04" - ) - -declare -A pkg_suffix=( ["ONON"]="-full" ["OFFOFF"]="-omp" ["OFFON"]="-cuda" \ - ["ONOFF"]="-rocm") - -#Packging - - -declare -A find_built_pkg=( ["archlinux-rolling"]='4.pkg.tar' \ - ["centos-7"]='4.rpm' \ - ["centos-8"]='4.rpm' \ - ["ubuntu-18.04"]='\.deb' \ - ) -declare -A packaging_script=( ["archlinux-rolling"]="make-archlinux-pkg.sh" \ - ["centos-7"]="make-centos-7-pkg.sh" \ - ["centos-8"]="make-centos-8-pkg.sh" \ - ["ubuntu-18.04"]="make-ubuntu-pkg.sh" \ - ) -declare -A packaging_image=( ["archlinux-rolling"]="archlinux-rolling" \ - ["centos-7"]="centos-7" \ - ["centos-8"]="centos-8" \ - ["ubuntu-18.04"]="ubuntu-18.04" \ - ) - -declare -A stage_dir=( ["archlinux-rolling"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_arch" \ - 
["centos-7"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_centos-7" \ - ["centos-8"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_centos-8" \ - ["ubuntu-18.04"]="$HIPSYCL_PKG_REPO_STAGE_DIR/new_pkg_ubuntu" \ - ) - -#Repo creation -declare -A repo_tools_cont=( ["archlinux-rolling"]="arch.sif" \ - ["centos-7"]="centos-7.sif" \ - ["centos-8"]="centos-7.sif" \ - ["ubuntu-18.04"]="ubuntu-18.04.sif" \ - ["ubuntu-20.04"]="ubuntu-18.04.sif" - ) - -declare -A repo_script=( ["archlinux-rolling"]="create_arch_repo.sh" \ - ["centos-7"]="create_centos_repo.sh centos-7" \ - ["centos-8"]="create_centos_repo.sh centos-8" \ - ["ubuntu-18.04"]="create_ubuntu_repo.sh bionic" \ - ["ubuntu-20.04"]="create_ubuntu_repo.sh focal" - ) - - - -#distros=( "centos-7" "archlinux-rolling" "ubuntu-18.04" "ubuntu-20.04") -#build_distros=( "centos-7" "archlinux-rolling" "ubuntu-18.04" ) diff --git a/devops/repos/create_pkgs.sh b/devops/repos/create_pkgs.sh deleted file mode 100644 index 345331dac..000000000 --- a/devops/repos/create_pkgs.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -if [ "$#" -lt 1 ]; then - echo " - Responsible for creating the packages from the built containers where the hipSYCL stack is installed. - Currently there are two modes supported hipsycl and base. hipsycl will only build the base packages, base will build all the - base packages, (rocm, base,) - Usage: - " - exit -1 -fi -distro=$1 -option=${2:-"hipsycl"} - - -HIPSYCL_PKG_BUILD_ROCM=ON -HIPSYCL_PKG_BUILD_BASE=ON -HIPSYCL_PKG_BUILD_HIPSYCL=ON - -if [ "$option" = "hipsycl" ]; then - HIPSYCL_PKG_BUILD_BASE=OFF - HIPSYCL_PKG_BUILD_ROCM=OFF -elif [ "$option" = "base" ]; then - HIPSYCL_PKG_BUILD_HIPSYCL=OFF -fi - -export HIPSYCL_PKG_BUILD_ROCM -export HIPSYCL_PKG_BUILD_BASE -export HIPSYCL_PKG_BUILD_HIPSYCL - -source $HIPSYCL_PKG_DEVOPS_DIR/common/init.sh - -HIPSYCL_PKG_DEVOPS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -HIPSYCL_PKG_SCRIPT_DIR=${HIPSYCL_PKG_SCRIPT_DIR:-../../install/scripts/} -HIPSYCL_PKG_SCRIPT_DIR_ABS=$HIPSYCL_PKG_DEVOPS_DIR/$HIPSYCL_PKG_SCRIPT_DIR -export HIPSYCL_PACKAGING_DIR="/tmp/hipsycl-packages-$distro" -cd $HIPSYCL_PKG_SCRIPT_DIR_ABS/packaging - -export SINGULARITYENV_HIPSYCL_PACKAGING_DIR=$HIPSYCL_PACKAGING_DIR - -stage_dir=${stage_dir[$distro]} - -singularity exec $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-${packaging_image[$distro]} bash ${packaging_script[$distro]} - -mkdir -p $HIPSYCL_PKG_DEVOPS_DIR/$stage_dir -for file in `find /tmp/hipsycl-packages-$distro | grep ${find_built_pkg[$distro]}`; do - mv $file $HIPSYCL_PKG_DEVOPS_DIR/$stage_dir/ -done -rm -rf $SINGULARITYENV_HIPSYCL_PACKAGING_DIR \ No newline at end of file diff --git a/devops/repos/create_repos.sh b/devops/repos/create_repos.sh deleted file mode 100644 index 9f263ed96..000000000 --- a/devops/repos/create_repos.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -set -o xtrace -distro=$1 -if [ -z $1 ]; then - echo "Provide the name of the distro as the first command line argument" - exit -1 -fi - -HIPSYCL_PKG_DEVOPS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -source $HIPSYCL_PKG_DEVOPS_DIR/common/init.sh - -cd $HIPSYCL_PKG_DEVOPS_DIR -SINGULARITY_BASE_DIR=${SINGULARITY_BASE_DIR:-./containers} -HIPSYCL_PKG_REPO_BASE_DIR=${HIPSYCL_PKG_REPO_BASE_DIR:-/data/repos} -HIPSYCL_PKG_SCRIPT_DIR=${HIPSYCL_PKG_SCRIPT_DIR:-$HIPSYCL_PKG_DEVOPS_DIR/repo-creation-scripts} -mkdir -p $HIPSYCL_PKG_REPO_BASE_DIR - -echo "$HIPSYCL_PKG_REPO_BASE_DIR" -singularity -d exec --fakeroot -B $HIPSYCL_PKG_REPO_BASE_DIR:/data/repos/ 
-B $HIPSYCL_PKG_DEVOPS_DIR:$HIPSYCL_PKG_DEVOPS_DIR \ - $SINGULARITY_BASE_DIR/${repo_tools_cont[$distro]} bash $HIPSYCL_PKG_SCRIPT_DIR/${repo_script[$distro]} - \ No newline at end of file diff --git a/devops/repos/create_singularity_containers.sh b/devops/repos/create_singularity_containers.sh deleted file mode 100644 index 4d75802e8..000000000 --- a/devops/repos/create_singularity_containers.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -SINGULARITY_BASE_DIR=${SINGULARITY_BASE_DIR:-./containers/} -SINGULARITY_DEF_DIR=${SINGULARTIY_DEF_DIR:-./definitions-packaging-container/} - -singularity build --fakreoot $SINGULARITY_BASE_DIR/centos-7.sif \ - $SINGULARITY_DEF_DIR/base-centos-7.def - -singularity build --fakreoot $SINGULARITY_BASE_DIR/ubuntu-18.04.sif \ - $SINGULARITY_DEF_DIR/base-ubuntu-18.04.def - -singularity build --fakreoot $SINGULARITY_BASE_DIR/arch.sif \ - $SINGULARITY_DEF_DIR/base-archlinux-rolling.def diff --git a/devops/repos/definitions-packaging-container/base-archlinux-rolling.def b/devops/repos/definitions-packaging-container/base-archlinux-rolling.def deleted file mode 100644 index a250d545b..000000000 --- a/devops/repos/definitions-packaging-container/base-archlinux-rolling.def +++ /dev/null @@ -1,8 +0,0 @@ -BootStrap: docker -From: archlinux:base - -%setup - -%post -pacman -Sy --noconfirm -pacman -S --noconfirm grep diff --git a/devops/repos/definitions-packaging-container/base-centos-7.def b/devops/repos/definitions-packaging-container/base-centos-7.def deleted file mode 100644 index 76620d7dd..000000000 --- a/devops/repos/definitions-packaging-container/base-centos-7.def +++ /dev/null @@ -1,11 +0,0 @@ -BootStrap: docker -From: centos:centos7 - -%setup - -%files - -%environment - -%post -yum -y install gpg vim wget createrepo rpm-sign diff --git a/devops/repos/definitions-packaging-container/base-ubuntu-18.04.def b/devops/repos/definitions-packaging-container/base-ubuntu-18.04.def deleted file mode 100644 index f7c8c8b8a..000000000 --- a/devops/repos/definitions-packaging-container/base-ubuntu-18.04.def +++ /dev/null @@ -1,13 +0,0 @@ -BootStrap: docker -From: ubuntu:18.04 - -%setup - -%files - -%environment - -%post -apt -y update -apt -y install dpkg-dev dpkg-sig apt-utils - diff --git a/devops/repos/definitions-test-containers/archlinux-rolling.def b/devops/repos/definitions-test-containers/archlinux-rolling.def deleted file mode 100644 index b84390f14..000000000 --- a/devops/repos/definitions-test-containers/archlinux-rolling.def +++ /dev/null @@ -1,9 +0,0 @@ -BootStrap: docker -From: archlinux:base - -%setup - -%post -pacman -Syu --noconfirm -pacman -Sy --noconfirm awk wget make base-devel - diff --git a/devops/repos/definitions-test-containers/centos-7.def b/devops/repos/definitions-test-containers/centos-7.def deleted file mode 100644 index f754087b5..000000000 --- a/devops/repos/definitions-test-containers/centos-7.def +++ /dev/null @@ -1,17 +0,0 @@ -BootStrap: docker -From: centos:centos7 - -%setup - -%files - -%environment -HIPSYCL_BASE_CC=gcc -HIPSYCL_BASE_CXX=g++ -. 
/opt/rh/devtoolset-9/enable - -%post -yum update -y -yum install epel-release -y -yum install -y rpm-build sed wget curl patch -yum install centos-release-scl -y \ No newline at end of file diff --git a/devops/repos/definitions-test-containers/ubuntu-18.04.def b/devops/repos/definitions-test-containers/ubuntu-18.04.def deleted file mode 100644 index 4ec4fafa4..000000000 --- a/devops/repos/definitions-test-containers/ubuntu-18.04.def +++ /dev/null @@ -1,13 +0,0 @@ -BootStrap: docker -From: ubuntu:18.04 - -%setup - -%files - -%environment - -%post -apt update -y -apt install -y wget gawk gnupg apt-utils build-essential -apt install -y software-properties-common \ No newline at end of file diff --git a/devops/repos/definitions-test-containers/ubuntu-20.04.def b/devops/repos/definitions-test-containers/ubuntu-20.04.def deleted file mode 100644 index 9a650f5ab..000000000 --- a/devops/repos/definitions-test-containers/ubuntu-20.04.def +++ /dev/null @@ -1,12 +0,0 @@ -BootStrap: docker -From: ubuntu:20.04 - -%setup - -%files - -%environment - -%post -apt update -y -apt install -y wget gawk gnupg apt-utils build-essential \ No newline at end of file diff --git a/devops/repos/publish_test_container.sh b/devops/repos/publish_test_container.sh deleted file mode 100644 index 5268b8758..000000000 --- a/devops/repos/publish_test_container.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -o xtrace -set -e -distro=$1 -cd $HIPSYCL_PKG_DEVOPS_DIR -commit_hash=`git rev-parse --short HEAD` -cd - -date=`date -u +"%Y%m%d"` -supported_backends="omp" -HIPSYCL_PKG_PUBLIC_CONTAINER_DIR=${HIPSYCL_PKG_PUBLIC_CONTAINER_DIR:-/data/repos/singularity/} -HIPSYCL_TEST_DIR=${HIPSYCL_TEST_DIR:-"/data/hipsyclbot/test-dir"} -mkdir -p $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR -for backend in `ls $HIPSYCL_TEST_DIR | sed -n -e "s/^.*$distro-//p"`; do - if [[ ${backend:0:1} = "1" ]]; then supported_backends="${supported_backends}-rocm"; fi - if [[ ${backend:1:2} = "1" ]]; then supported_backends="${supported_backends}-cuda"; fi - container_name_base="hipSYCL-${HIPSYCL_PKG_TYPE}-${distro}-${supported_backends}" - container_name="${container_name_base}-${date}-${commit_hash}.sif" - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backend rm -rf /opt/hipSYCL/cuda - #On arch these sockets cause a error while packing the container - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backend rm -rf /etc/pacman.d/gnupg/S.gpg-agent.browser \ - /etc/pacman.d/gnupg/S.gpg-agent.ssh /etc/pacman.d/gnupg/S.gpg-agent.extra /etc/pacman.d/gnupg/S.gpg-agent - singularity build --force --fakeroot $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR/$container_name $HIPSYCL_TEST_DIR/hipsycl-$distro-$backend - supported_backends="omp" - #Keep only the two latest container from each kind - ls -t $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR | grep $container_name_base-[0-9] | tail -n +3 | xargs -I '_' rm -rf $HIPSYCL_PKG_PUBLIC_CONTAINER_DIR/_ -done - - - - - diff --git a/devops/repos/record_env_vars.sh b/devops/repos/record_env_vars.sh deleted file mode 100644 index 9e8cbd480..000000000 --- a/devops/repos/record_env_vars.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# This small script is needed for the workflow to work -# since it is currently not possible to build containers -# inside containers, we break out from the container containing -# the workflow to a separate user on our server -# -# The GitHub Action sets the variables in its container -# we use this script to record the variables -# and then we copy the variables in a 
sourceable form -# to the user where the actual building will happen. - -rm -rf envs.out -touch envs.out -for env in `env | grep HIPSYCL`; do - echo "export $env" >> envs.out -done - - diff --git a/devops/repos/repo-creation-scripts/create_arch_repo.sh b/devops/repos/repo-creation-scripts/create_arch_repo.sh deleted file mode 100644 index 0a6f3ab73..000000000 --- a/devops/repos/repo-creation-scripts/create_arch_repo.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace - -# We assume that the packages are already signed -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source $DIR/../common/init.sh -ARCH_REPO_DIR=/data/repos/archlinux/x86_64/ - -mkdir -p $ARCH_REPO_DIR - -cd ${stage_dir["archlinux-rolling"]} -for f in *.tar.zst -do - mv $f $ARCH_REPO_DIR - mv $f.sig $ARCH_REPO_DIR - repo-add --sign -k B2B75080 $ARCH_REPO_DIR/hipsycl.db.tar $ARCH_REPO_DIR/$f -done - diff --git a/devops/repos/repo-creation-scripts/create_centos_repo.sh b/devops/repos/repo-creation-scripts/create_centos_repo.sh deleted file mode 100644 index cec5d77ae..000000000 --- a/devops/repos/repo-creation-scripts/create_centos_repo.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -if [ "$#" -ne "1" ]; then - echo "Please specifiy the distro (centos7 or cnetos8) as first argument" -fi -distro=$1 - -declare -A repo_dir=( ["centos-7"]="centos7" \ - ["centos-8"]="centos8" \ - ) -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source $DIR/../common/init.sh -CENTOS_REPO_DIR=/data/repos/rpm/${repo_dir[$distro]} -mkdir -p $CENTOS_REPO_DIR -cd ${stage_dir[$distro]} -echo $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX -for f in * -do - echo $f - mv $f $CENTOS_REPO_DIR - echo "" | setsid rpmsign --addsign $CENTOS_REPO_DIR/$f -done -createrepo $CENTOS_REPO_DIR -cp $DIR/hipsycl-$distro.repo $CENTOS_REPO_DIR/hipsycl.repo -echo $DIR -echo $CENTOS_REPO_DIR -sed "s|sycl{}|sycl$HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX|" $DIR/hipsycl-$distro.repo > $CENTOS_REPO_DIR/hipsycl.repo diff --git a/devops/repos/repo-creation-scripts/create_ubuntu_repo.sh b/devops/repos/repo-creation-scripts/create_ubuntu_repo.sh deleted file mode 100644 index 88cd31af9..000000000 --- a/devops/repos/repo-creation-scripts/create_ubuntu_repo.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source $DIR/../common/init.sh - -UBUNTU_REPO_DIR=${UBUNTU_REPO_DIR:-/data/repos/deb} -DIST=${1:-bionic} - -PKG_PATH=$UBUNTU_REPO_DIR/dists/$DIST/main/binary-amd64/ -RELEASE_PATH=$UBUNTU_REPO_DIR/dists/$DIST/ -POOL_PATH=$UBUNTU_REPO_DIR/pool/ - -mkdir -p $PKG_PATH -mkdir -p $POOL_PATH - -cd ${stage_dir["ubuntu-18.04"]} - -for f in * -do - echo $f - set +e - mv $f $POOL_PATH - set -e -done -cd $UBUNTU_REPO_DIR -# we need the relative path because it will write it directly in Packages -apt-ftparchive packages ./pool > $PKG_PATH/Packages - -cd $PKG_PATH -gzip -k -f $PKG_PATH/Packages || true -cd $RELEASE_PATH -echo `pwd` -apt-ftparchive release . 
| tee $RELEASE_PATH/Release - -echo `pwd` -rm -f Release.gpg -rm -f InRelease -gpg --batch --no-tty --default-key B2B75080 -abs -o Release.gpg Release -gpg --batch --no-tty --default-key B2B75080 --clearsign -o InRelease Release - diff --git a/devops/repos/repo-creation-scripts/hipsycl-centos-7.repo b/devops/repos/repo-creation-scripts/hipsycl-centos-7.repo deleted file mode 100644 index 581b507b0..000000000 --- a/devops/repos/repo-creation-scripts/hipsycl-centos-7.repo +++ /dev/null @@ -1,7 +0,0 @@ -[hipSYCL-repository] -name=hipSYCL repository -baseurl=http://repo.urz.uni-heidelberg.de/sycl{}/rpm/centos7/ -gpgcheck=1 -gpgkey=http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc -enabled=1 - diff --git a/devops/repos/test-installation.sh b/devops/repos/test-installation.sh deleted file mode 100755 index 714f977b9..000000000 --- a/devops/repos/test-installation.sh +++ /dev/null @@ -1,85 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -set -xv -if [ "$#" -ne 3 ]; then - echo " - This script is responsible for testing the installation inside a built container containing a hipSYCL installation - the following tests are built and executed: sycl_tests - - - usage: - - dir_of_test_scripts: Points to the directory where this script is located - distro: The distribution for which the packages are suposed to be tested - backend: A bitmask of the enabled backends, from leat to most important bit: CUDA,ROCM. 1 means enabled 0 means disabled - - Important ENV variables: - - HIPSYCL_TEST_DIR: The location where the test containers will be installed - - HIPSYCL_TEST_EXCLUDE_FROM_RT: by default set to hip:gfx900. For this backend, we only build the tests. - " - exit -1 -fi -cd $1 -distro=$2 -backend=$3 -HIPSYCL_WITH_CUDA="OFF" -HIPSYCL_WITH_ROCM="OFF" -if [[ ${backend:0:1} = "1" ]]; then HIPSYCL_WITH_ROCM="ON"; else HIPSYCL_WITH_ROCM="OFF"; fi -if [[ ${backend:1:2} = "1" ]]; then HIPSYCL_WITH_CUDA="ON"; else HIPSYCL_WITH_CUDA="OFF"; fi -source ./common/init.sh -current_time=$(date "+%Y.%m.%d-%H.%M.%S") - -cmake_path=/opt/hipSYCL/llvm/cmake/bin/cmake -HIPSYCL_TEST_LOG_DIR=${HIPSYCL_TEST_LOG_DIR:-/tmp/hipsycl-logs} -mkdir -p $HIPSYCL_TEST_LOG_DIR -HIPSYCL_TEST_CUDA_ARCH=${HIPSYCL_TEST_CUDA_ARCH:-sm_61} -HIPSYCL_TEST_ROCM_ARCH=${HIPSYCL_TEST_ROCM_ARCH:-gfx900} - -log_file=${log_file:-$HIPSYCL_TEST_LOG_DIR/hipSYCL_image_test-$current_time} -touch $log_file -slurm_out=${slurm_out:-$log_file} - -targets=( "omp" ) -[ "$HIPSYCL_WITH_CUDA" = "ON" ] && targets+=( "cuda:$HIPSYCL_TEST_CUDA_ARCH" ) -[ "$HIPSYCL_WITH_ROCM" = "ON" ] && targets+=( "hip:$HIPSYCL_TEST_ROCM_ARCH" ) - - -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-develop} - -echo "Testing hipSYCL singularity images at $HIPSYCL_PKG_CONTAINER_DIR for targets ${targets[*]}" >> $log_file -echo "Cloning form user $HIPSYCL_REPO_USER branch $HIPSYCL_REPO_BRANCH " >> $log_file - -HIPSYCL_TEST_EXCLUDE_FROM_RT=${HIPSYCL_TEST_EXCLUDE_FROM_RT:-"hip:gfx900"} -DIR=`pwd` - -mkdir -p /tmp/hipSYCL-test/tests/build -mkdir -p /tmp/build/$distro-$backend - -for target in ${targets[@]}; do - echo "Starting test for $target for $distro" >> $log_file - singularity exec --cleanenv $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro-$backend \ - $cmake_path \ - -DCMAKE_PREFIX_PATH=/opt/hipSYCL/boost/boost \ - -DCMAKE_C_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang \ - -DCMAKE_CXX_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang++ \ - -DHIPSYCL_TARGETS=$target \ - -S /tmp/hipSYCL-test/tests \ - -B /tmp/build/$distro-$backend - - - VERBOSE=1 
CUDA_VISIBLE_DEVICES=0 singularity exec --nv \ - -H /tmp/build/$distro-$backend $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro-$backend \ - make -j 16 - - if [ ! "$target" = $HIPSYCL_TEST_EXCLUDE_FROM_RT ] ;then - #CUDA_VISIBLE_DEVICES=0 \ - singularity exec --nv \ - $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro-$backend \ - /tmp/build/$2-$3/sycl_tests - else - echo "test_skipped" >> $log_file - fi - rm -rf /tmp/build/$2-$3 -done diff --git a/devops/repos/test-packages.sh b/devops/repos/test-packages.sh deleted file mode 100755 index 385452175..000000000 --- a/devops/repos/test-packages.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -source ~/envs.out -if [ "$#" -lt 4 ]; then - echo " - This script is responsible for creating a base image (base), adding the hipSYCL repo (add_repo), - installing the hipSYCL package and its dependencies (install_dependencies), and then running the tests - (run_tests) and eventually cleaning up (clean_up) for the specified distribution and backend combination - - usage: - [action: build, add_repo, intall_dependencies, run_test, clean_up] - - dir_of_test_scripts: Points to the directory where this script is located - distro: The distribution for which the packages are supposed to be tested - backends: A bitmask of the enabled backends, from leat to most important bit: CUDA,ROCM. 1 means enabled 0 means disabled - actions: build: build a container image for the specified distribution: - it creates an image in the directory: HIPSYCL_TEST_DIR/distro-backend folder - add_repo: run the ../../install/scripts/add-repo-.sh script to add the hipSYCL repo to the base image - install_dependencis: Installs the version of hipSYCL with the targeted backends, in case of Cuda backend, is tested, install Cuda from an external source see - ../../install/scripts/spack-install-Cuda.sh - run_test: executes the ./test-installation.sh script in the built singularity container. - clean_up: Useful if the container is going to be reused. Deletes all installed packages and Cuda if necessary. - target_repo: an optional path to the repository directory from the hipSYCL base repo. 
useful if testing experimental repos - - Important ENV variables: - - HIPSYCL_TEST_DIR: The location where the test containers will be installed - " - exit -1 -fi -home_dir=$1 -distro=$2 -backends=$3 -action=$4 -target_repo=$5 - -HIPSYCL_WITH_CUDA="OFF" -HIPSYCL_WITH_ROCM="OFF" -if [[ ${backends:0:1} = "1" ]]; then HIPSYCL_WITH_ROCM="ON"; else HIPSYCL_WITH_ROCM="OFF"; fi -if [[ ${backends:1:2} = "1" ]]; then HIPSYCL_WITH_CUDA="ON"; else HIPSYCL_WITH_CUDA="OFF"; fi - -cd $home_dir -source ./common/init.sh -#slurm_out=$1/slurm-$SLURM_JOB_ID.out -#target_repo=${2:-""} - -echo $slurm_out -current_time=$(date "+%Y.%m.%d-%H.%M.%S") -HIPSYCL_TEST_DIR=${HIPSYCL_TEST_DIR:-/tmp/hipsycl-test/} -echo $HIPSYCL_TEST_DIR -HIPSYCL_PKG_TYPE=${HIPSYCL_PKG_TYPE:-"-nightly"} -mkdir -p $HIPSYCL_TEST_DIR -export slurm_out - -dict_key="$HIPSYCL_WITH_ROCM$HIPSYCL_WITH_CUDA" -echo "Starting comprehensive testing of the package repositories for ${distros[*]}" - -if [ "$action" = "build" ];then - singularity build --fakeroot --force --sandbox $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends ./definitions-test-containers/$distro.def - - -elif [ "$action" = "add_repo" ]; then - singularity exec --fakeroot --writable -B ../../install/scripts:/mnt \ - $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends sh /mnt/add-hipsycl-repo/$distro.sh $target_repo - - -elif [ "$action" = "install_dep" ]; then - if [ "$HIPSYCL_WITH_CUDA" = "ON" ]; then - singularity exec --fakeroot --writable -B ../../install/scripts:/mnt \ - $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends sh /mnt/spack-install/cuda.sh - fi - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends \ - ${install_cmd[$distro]}${pkg_suffix[$dict_key]}-$HIPSYCL_PKG_TYPE - - -elif [ "$action" = "run_tests" ]; then - export HIPSYCL_WITH_CUDA - export HIPSYCL_WITH_ROCM - echo "Start testing" - HIPSYCL_PKG_CONTAINER_DIR=$HIPSYCL_TEST_DIR - export HIPSYCL_PKG_CONTAINER_DIR - `pwd`/test-installation.sh `pwd` $distro $backends - - -elif [ "$action" = "clean_up" ]; then - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends \ - ${cleanup_cmd[$distro]}${pkg_suffix[$dict_key]}-$HIPSYCL_PKG_TYPE - - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends \ - ${cleanup_dep[$distro]} - - singularity exec --fakeroot --writable $HIPSYCL_TEST_DIR/hipsycl-$distro-$backends rm -rf /opt/hipSYCL/cuda -fi diff --git a/devops/repos/update_repo.sh b/devops/repos/update_repo.sh deleted file mode 100644 index 73840b28d..000000000 --- a/devops/repos/update_repo.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash - -set -e -set -o xtrace -if [ $1 = "--help" ]; then - echo " - This file is responsible for driving the packaging, building, and testing process for the hipSYCL packaging system. - It sets and exports defaults for the important environment variables that might concern the builds - - Usage: $ update_repo.sh [option] - distro: centos-7, ubuntu-18.04 etc... 
- action: build_base, build_hipsycl, package, deploy, test" - exit -1 -fi - -distro=$1 -action=$2 -option=$3 -set +e -source /etc/profile -set -e -source ${HIPSYCL_PKG_ENV_FILE:-~/envs.out} - -HIPSYCL_PKG_DEVOPS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -HIPSYCL_PKG_SCRIPT_DIR=${HIPSYCL_PKG_SCRIPT_DIR:-../../install/scripts/} -HIPSYCL_PKG_SCRIPT_DIR_ABS=$HIPSYCL_PKG_DEVOPS_DIR/$HIPSYCL_PKG_SCRIPT_DIR -HIPSYCL_PKG_REPO_BASE_DIR=${HIPSYCL_PKG_REPO_BASE_DIR:-/data/repos/} -HIPSYCL_PKG_REPO_BASE_DIR=$HIPSYCL_PKG_REPO_BASE_DIR/$HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX -HIPSYCL_PKG_PUBLIC_CONTAINER_DIR=${HIPSYCL_PKG_PUBLIC_CONTAINER_DIR:-/data/repos/singularity/} -source $HIPSYCL_PKG_DEVOPS_DIR/common/init.sh - -HIPSYCL_TEST_DIR="/data/hipsyclbot/test-dir" -mkdir -p $HIPSYCL_TEST_DIR - -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-stable} - -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-9} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-1} - -HIPSYCL_HIP_VERSION=${HIPSYCL_HIP_VERSION:-4.0.0} - -HIPSYCL_PKG_CONTAINER_DIR_SUFFIX=${HIPSYCL_PKG_CONTAINER_DIR_SUFFIX:-containers} -HIPSYCL_PKG_CONTAINER_DIR_SUFFIX=${HIPSYCL_PKG_CONTAINER_DIR_SUFFIX}${HIPSYCL_PKG_NAME_SUFFIX} -HIPSYCL_PKG_CONTAINER_DIR_NAME=${HIPSYCL_PKG_LLVM_REPO_BRANCH/release\//llvm-}- -HIPSYCL_PKG_CONTAINER_DIR=${HIPSYCL_PKG_CONTAINER_DIR:-$HIPSYCL_PKG_SCRIPT_DIR_ABS/${HIPSYCL_PKG_CONTAINER_DIR_NAME}-${HIPSYCL_PKG_CONTAINER_DIR_SUFFIX}} -HIPSYCL_PKG_TYPE=${HIPSYCL_PKG_TYPE:-nightly} - -export HIPSYCL_PKG_CONTAINER_DIR -export HIPSYCL_PKG_LLVM_REPO_BRANCH -export HIPSYCL_PKG_LLVM_VERSION_MAJOR -export HIPSYCL_PKG_LLVM_VERSION_MINOR -export HIPSYCL_PKG_LLVM_VERSION_PATCH -export HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX -export HIPSYCL_REPO_USER -export HIPSYCL_REPO_BRANCH -export HIPSYCL_PKG_TYPE -export HIPSYCL_PKG_NAME_SUFFIX -export HIPSYCL_PKG_DEVOPS_DIR -export HIPSYCL_WITH_CUDA -export HIPSYCL_WITH_ROCM - - -if [ "$action" = "build_base" ]; then - bash $HIPSYCL_PKG_SCRIPT_DIR_ABS/rebuild-images.sh $distro $option -fi - -if [ "$action" = "build_hipsycl" ]; then - bash $HIPSYCL_PKG_SCRIPT_DIR_ABS/rebuild-images.sh $distro cleanup - bash $HIPSYCL_PKG_SCRIPT_DIR_ABS/rebuild-images.sh $distro $option -fi - -if [ "$action" = "package" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/create_pkgs.sh $distro $option -fi - -if [ "$action" = "deploy" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/create_repos.sh $distro -fi - -if [ "$action" = "test" ]; then - if [ -z "${@:4}" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option build $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option add_repo $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option install_dep $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option run_tests $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - rm -rf /data/sbalint/singularity_tmp/* - else - bash $HIPSYCL_PKG_DEVOPS_DIR/test-packages.sh $HIPSYCL_PKG_DEVOPS_DIR $distro $backend $option $4 $HIPSYCL_PKG_REPO_BASE_DIR_SUFFIX - fi -fi - - -if [ "$action" = "pub_cont" ]; then - bash $HIPSYCL_PKG_DEVOPS_DIR/publish_test_container.sh $distro -fi diff --git a/doc/algorithms.md b/doc/algorithms.md new file 
mode 100644 index 000000000..278083061 --- /dev/null +++ b/doc/algorithms.md @@ -0,0 +1,389 @@
+# AdaptiveCpp parallel algorithms library
+
+AdaptiveCpp ships with a library for common parallel primitives. This library is supported on all backends, with all compiler-based compilation flows. The library-only compilation flows `omp.library-only` and `cuda-nvcxx` are currently unsupported.
+
+The main support target is the generic JIT compiler (`--acpp-targets=generic`).
+
+## Example
+
+```c++
+#include <sycl/sycl.hpp>
+#include <hipSYCL/algorithms/numeric.hpp>
+
+void run_scan(sycl::queue& q, int* device_data_ptr, int* device_output_ptr,
+              std::size_t problem_size) {
+  // Set up handling for temporary scratch memory. Note: In production workloads,
+  // the allocation cache should be reused by multiple algorithm invocations for
+  // optimal performance.
+  acpp::algorithms::util::allocation_cache cache{
+      acpp::algorithms::util::allocation_type::device};
+  // Create a handle for the current invocation to manage its allocation requests
+  acpp::algorithms::util::allocation_group scratch{&cache, q.get_device()};
+  // Invoke inclusive_scan
+  auto evt = acpp::algorithms::inclusive_scan(q, scratch, device_data_ptr,
+      device_data_ptr + problem_size, device_output_ptr, sycl::plus<int>{});
+
+  evt.wait();
+}
+
+```
+
+## Basic concepts
+
+* All algorithms are exclusively supported for the SYCL 2020 USM memory management model (either `device`, `host` or `shared` allocations). The old SYCL `buffer` model is unsupported.
+* All algorithms take a `sycl::queue` to which they submit their operations. Both out-of-order and in-order queues are supported, but we recommend in-order queues for performance and because the library is better tested with in-order queues.
+* All algorithms operate asynchronously, i.e. it is the user's responsibility to synchronize appropriately before results are accessed.
+* All algorithms take an optional `const std::vector<sycl::event>&` argument that can be used to express dependencies.
+* All algorithms return a `sycl::event` which can be used for synchronization. Note: If an algorithm is invoked for a problem size of 0, then for performance reasons it immediately returns a default-constructed `sycl::event` which has a `completed` status. This is the case even if the algorithm has dependencies that are not yet complete!
+* Some algorithms require temporary scratch memory. For performance reasons, this scratch memory is cached. The AdaptiveCpp algorithms library exposes control over allocation lifetime and allocation kind for this scratch memory to users (see below).
+* The iterators passed into the algorithms need to be valid on the target device.
+
+## Allocation cache for scratch memory
+
+
+```c++
+
+namespace acpp::algorithms::util {
+
+/// Encodes which kind of allocations the allocation cache manages
+enum class allocation_type {
+  device, // device USM (sycl::malloc_device())
+  shared, // shared USM (sycl::malloc_shared())
+  host    // host USM (sycl::malloc_host())
+};
+
+
+/// The allocation_cache serves as an allocation pool which can serve the
+/// need of algorithms. It releases its memory upon destruction or when purge()
+/// is called. It is the user's responsibility to ensure that neither event
+/// occurs while an algorithm using the allocation cache is still running!
+///
+/// This class is thread-safe, although it might be a good idea to check
+/// whether thread-local allocation caches might result in better performance.
+class allocation_cache {
+public:
+  /// Construct an allocation_cache for a specified memory type
+  allocation_cache(allocation_type alloc_type);
+
+  /// When the allocation_cache is destroyed, all allocations that it manages
+  /// are freed. Users must ensure that the lifetime of the object extends until all operations
+  /// using it have completed.
+  ~allocation_cache();
+
+  /// Explicitly free allocations. Users must ensure that this is not invoked before all
+  /// operations using it have completed.
+  void purge();
+};
+
+/// An allocation_group represents a handle for an algorithm invocation
+/// to manage its temporary scratch memory needs.
+/// In typical scenarios, you will want to use one allocation_group object
+/// per algorithm invocation.
+///
+/// When the allocation_group is destroyed, the allocations that were requested
+/// through it are returned to the parent cache, which might then use them
+/// to serve other requests.
+/// Therefore, users need to
+/// * either ensure that the allocation_group is not destroyed before all algorithms
+///   using it have completed
+/// * or guarantee that allocations may be safely reassigned to other operations while
+///   they are still running, e.g. because all submitted algorithms using the same
+///   allocation_cache are ordered such that no race condition on the scratch allocations may
+///   occur. (Imagine e.g. if all algorithms sharing one allocation_cache are submitted to a single
+///   in-order queue)
+///
+/// This class is not thread-safe.
+class allocation_group {
+public:
+  /// Construct allocation_group for the given cache and device.
+  ///
+  /// The user is responsible for ensuring that the lifetime of the provided parent cache
+  /// exceeds the lifetime of this allocation_group.
+  ///
+  /// The device will be used to provide the memory allocation context; for
+  /// typical practical applications it will be the same device that the
+  /// algorithm is submitted to.
+  /// If the memory from this device is not accessible to the device to which
+  /// the algorithm is submitted, the behavior is undefined.
+  allocation_group(allocation_cache *parent_cache, const sycl::device &dev);
+
+  allocation_group() = default;
+  allocation_group(const allocation_group&) = delete;
+  allocation_group& operator=(const allocation_group&) = delete;
+
+  /// Releases all managed allocations to the parent cache to be reassigned to
+  /// other operations.
+  ~allocation_group();
+
+  /// Explicitly releases all managed allocations to the parent cache to be reassigned to
+  /// other operations.
+  ///
+  /// It is the user's responsibility to ensure that this function is not called
+  /// before all managed allocations can be safely returned to the parent cache.
+  void release();
+
+  /// Request a new allocation with the specified number of elements of type T.
+  ///
+  /// If the parent allocation cache has an allocation of sufficient size available,
+  /// then it will be returned and made unavailable for other allocation requests.
+  /// Otherwise, a new allocation will be created.
+  template<class T>
+  T* obtain(std::size_t count);
+};
+
+
+}
+```
+
+## Algorithms
+
+The following algorithms are currently supported. Their definitions align with those in the C++ STL. Please refer to the C++ reference of your choice for more information on them.
+
+Here, we will only describe AdaptiveCpp-specific behavior.
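As a usage illustration of the lifecycle rules above, the following sketch reuses one `allocation_cache` across two invocations of `reduce` (documented in the numeric header section below), with one `allocation_group` per invocation. The include path is an assumption for illustration:

```c++
#include <sycl/sycl.hpp>
#include <hipSYCL/algorithms/numeric.hpp> // assumed include path

void run_reductions(sycl::queue& q, int* data, int* out_a, int* out_b,
                    std::size_t n) {
  namespace util = acpp::algorithms::util;
  // One long-lived cache whose allocations are reused by both invocations.
  util::allocation_cache cache{util::allocation_type::device};
  {
    // One allocation_group per algorithm invocation.
    util::allocation_group scratch{&cache, q.get_device()};
    acpp::algorithms::reduce(q, scratch, data, data + n, out_a, 0,
                             sycl::plus<int>{}).wait();
  } // Scratch allocations return to the cache here -- safe, since we waited.
  {
    util::allocation_group scratch{&cache, q.get_device()};
    acpp::algorithms::reduce(q, scratch, data, data + n, out_b, 0,
                             sycl::plus<int>{}).wait();
  }
} // The cache is destroyed: all scratch memory is freed.
```

Destroying each `allocation_group` only after `wait()` returns satisfies the requirement that scratch memory is not reassigned while an algorithm is still running.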
+
+### Header `<hipSYCL/algorithms/algorithm.hpp>`
+
+```c++
+namespace acpp::algorithms {
+
+
+template <class ForwardIt, class UnaryFunction2>
+sycl::event for_each(sycl::queue &q, ForwardIt first, ForwardIt last,
+                     UnaryFunction2 f,
+                     const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt, class Size, class UnaryFunction2>
+sycl::event for_each_n(sycl::queue &q, ForwardIt first, Size n,
+                       UnaryFunction2 f,
+                       const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt1, class ForwardIt2, class UnaryOperation>
+sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1,
+                      ForwardIt2 d_first, UnaryOperation unary_op,
+                      const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt1, class ForwardIt2, class ForwardIt3,
+          class BinaryOperation>
+sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1,
+                      ForwardIt2 first2, ForwardIt3 d_first,
+                      BinaryOperation binary_op,
+                      const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt1, class ForwardIt2>
+sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last,
+                 ForwardIt2 d_first, const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt1, class ForwardIt2, class UnaryPredicate>
+sycl::event copy_if(sycl::queue &q, util::allocation_group &scratch_allocations,
+                    ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first,
+                    UnaryPredicate pred,
+                    std::size_t *num_elements_copied = nullptr,
+                    const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt1, class Size, class ForwardIt2>
+sycl::event copy_n(sycl::queue &q, ForwardIt1 first, Size count,
+                   ForwardIt2 result,
+                   const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt, class T>
+sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last,
+                 const T &value, const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt, class Size, class T>
+sycl::event fill_n(sycl::queue& q,
+                   ForwardIt first, Size count, const T& value,
+                   const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt, class Generator>
+sycl::event generate(sycl::queue &q, ForwardIt first, ForwardIt last,
+                     Generator g, const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt, class Size, class Generator>
+sycl::event generate_n(sycl::queue &q, ForwardIt first, Size count, Generator g,
+                       const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt, class T>
+sycl::event replace(sycl::queue &q, ForwardIt first, ForwardIt last,
+                    const T &old_value, const T &new_value,
+                    const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt, class UnaryPredicate, class T>
+sycl::event replace_if(sycl::queue &q, ForwardIt first, ForwardIt last,
+                       UnaryPredicate p, const T &new_value,
+                       const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt1, class ForwardIt2, class UnaryPredicate, class T>
+sycl::event replace_copy_if(sycl::queue &q, ForwardIt1 first, ForwardIt1 last,
+                            ForwardIt2 d_first, UnaryPredicate p,
+                            const T &new_value,
+                            const std::vector<sycl::event> &deps = {});
+
+template <class ForwardIt1, class ForwardIt2, class T>
+sycl::event replace_copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last,
+                         ForwardIt2 d_first, const T &old_value,
+                         const T &new_value,
+                         const std::vector<sycl::event> &deps = {});
+
+/// The result of the operation will be stored in out.
+///
+/// out must point to device-accessible memory, and will be set to 0
+/// for a negative result, and 1 for a positive result.
+template <class ForwardIt, class UnaryPredicate>
+sycl::event all_of(sycl::queue &q,
+                   ForwardIt first, ForwardIt last, int* out,
+                   UnaryPredicate p, const std::vector<sycl::event>& deps = {});
+
+/// The result of the operation will be stored in out.
+///
+/// out must point to device-accessible memory, and will be set to 0
+/// for a negative result, and 1 for a positive result.
+template <class ForwardIt, class UnaryPredicate>
+sycl::event any_of(sycl::queue &q,
+                   ForwardIt first, ForwardIt last, int* out,
+                   UnaryPredicate p, const std::vector<sycl::event>& deps = {});
+
+/// The result of the operation will be stored in out.
+///
+/// out must point to device-accessible memory, and will be set to 0
+/// for a negative result, and 1 for a positive result.
+template <class ForwardIt, class UnaryPredicate>
+sycl::event none_of(sycl::queue &q,
+                    ForwardIt first, ForwardIt last, int* out,
+                    UnaryPredicate p, const std::vector<sycl::event>& deps = {});
+
+template <class RandomIt, class Compare = std::less<>>
+sycl::event sort(sycl::queue &q, RandomIt first, RandomIt last,
+                 Compare comp = std::less<>{},
+                 const std::vector<sycl::event>& deps = {});
+
+template< class ForwardIt1, class ForwardIt2,
+          class ForwardIt3, class Compare >
+sycl::event merge(sycl::queue& q,
+                  util::allocation_group &scratch_allocations,
+                  ForwardIt1 first1, ForwardIt1 last1,
+                  ForwardIt2 first2, ForwardIt2 last2,
+                  ForwardIt3 d_first, Compare comp = std::less<>{},
+                  const std::vector<sycl::event>& deps = {});
+
+}
+
+
+```
+
+### Header `<hipSYCL/algorithms/numeric.hpp>`
+
+```c++
+namespace acpp::algorithms {
+
+/// The result of the reduction will be written to out.
+///
+/// out must point to memory that is accessible on the target device.
+template <class ForwardIt1, class ForwardIt2, class T,
+          class BinaryReductionOp, class BinaryTransformOp>
+sycl::event
+transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations,
+                 ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, T *out,
+                 T init, BinaryReductionOp reduce,
+                 BinaryTransformOp transform,
+                 const std::vector<sycl::event>& deps = {});
+
+/// The result of the reduction will be written to out.
+///
+/// out must point to memory that is accessible on the target device.
+template <class ForwardIt, class T, class BinaryReductionOp,
+          class UnaryTransformOp>
+sycl::event
+transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations,
+                 ForwardIt first, ForwardIt last, T* out, T init,
+                 BinaryReductionOp reduce, UnaryTransformOp transform,
+                 const std::vector<sycl::event>& deps = {});
+
+/// The result of the reduction will be written to out.
+///
+/// out must point to memory that is accessible on the target device.
+template <class ForwardIt1, class ForwardIt2, class T>
+sycl::event transform_reduce(sycl::queue &q,
+                             util::allocation_group &scratch_allocations,
+                             ForwardIt1 first1, ForwardIt1 last1,
+                             ForwardIt2 first2, T *out, T init,
+                             const std::vector<sycl::event>& deps = {});
+
+
+/// The result of the reduction will be written to out.
+///
+/// out must point to memory that is accessible on the target device.
+template <class ForwardIt, class T, class BinaryOp>
+sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations,
+                   ForwardIt first, ForwardIt last, T *out, T init,
+                   BinaryOp binary_op,
+                   const std::vector<sycl::event>& deps = {});
+
+/// The result of the reduction will be written to out.
+///
+/// out must point to memory that is accessible on the target device.
+template <class ForwardIt, class T>
+sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations,
+                   ForwardIt first, ForwardIt last, T *out, T init,
+                   const std::vector<sycl::event>& deps = {});
+
+/// The result of the reduction will be written to out.
+///
+/// out must point to memory that is accessible on the target device.
+template <class ForwardIt>
+sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations,
+                   ForwardIt first, ForwardIt last,
+                   typename std::iterator_traits<ForwardIt>::value_type *out,
+                   const std::vector<sycl::event>& deps = {});
+
+
+template <class InputIt, class OutputIt, class BinaryOp>
+sycl::event
+inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations,
+               InputIt first, InputIt last, OutputIt d_first, BinaryOp op,
+               const std::vector<sycl::event> &deps = {});
+
+template <class InputIt, class OutputIt, class BinaryOp, class T>
+sycl::event
+inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations,
+               InputIt first, InputIt last, OutputIt d_first, BinaryOp op,
+               T init, const std::vector<sycl::event> &deps = {});
+
+template <class InputIt, class OutputIt>
+sycl::event inclusive_scan(sycl::queue &q,
+                           util::allocation_group &scratch_allocations,
+                           InputIt first, InputIt last, OutputIt d_first,
+                           const std::vector<sycl::event> &deps = {});
+
+template <class InputIt, class OutputIt, class T, class BinaryOp>
+sycl::event
+exclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations,
+               InputIt first, InputIt last, OutputIt d_first, T init,
+               BinaryOp op, const std::vector<sycl::event> &deps = {});
+
+template <class InputIt, class OutputIt, class T>
+sycl::event exclusive_scan(sycl::queue &q,
+                           util::allocation_group &scratch_allocations,
+                           InputIt first, InputIt last, OutputIt d_first,
+                           T init, const std::vector<sycl::event> &deps = {});
+
+template <class InputIt, class OutputIt, class BinaryOp, class UnaryOp>
+sycl::event transform_inclusive_scan(
+    sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first,
+    InputIt last, OutputIt d_first, BinaryOp binary_op, UnaryOp unary_op,
+    const std::vector<sycl::event> &deps = {});
+
+template <class InputIt, class OutputIt, class BinaryOp, class UnaryOp,
+          class T>
+sycl::event transform_inclusive_scan(
+    sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first,
+    InputIt last, OutputIt d_first, BinaryOp binary_op, UnaryOp unary_op,
+    T init, const std::vector<sycl::event> &deps = {});
+
+template <class InputIt, class OutputIt, class T, class BinaryOp,
+          class UnaryOp>
+sycl::event transform_exclusive_scan(
+    sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first,
+    InputIt last, OutputIt d_first, T init, BinaryOp binary_op,
+    UnaryOp unary_op, const std::vector<sycl::event> &deps = {});
+
+
+}
+```
\ No newline at end of file
diff --git a/doc/cleanup_syclcchelp.sh b/doc/cleanup_syclcchelp.sh index a08e296c9..6992c7092 100755 --- a/doc/cleanup_syclcchelp.sh +++ b/doc/cleanup_syclcchelp.sh @@ -1,7 +1,7 @@ #! /bin/bash # usage: acpp --help | ./cleanup_syclcchelp.sh -# output is in sylccout then and should be copied into using-hipsycl.md +# output is in sylccout then and should be copied into using-acpp.md sed "s/\[current value: .*\]/[current value: NOT SET]/g" > acppout
diff --git a/doc/compilation.md b/doc/compilation.md index be75af9a0..9b7f5541b 100644 --- a/doc/compilation.md +++ b/doc/compilation.md @@ -3,18 +3,18 @@ AdaptiveCpp supports multiple types of compilation flows: 1. **A generic, single-pass compiler infrastructure that compiles kernels to a unified code representation** that is then lowered at runtime to target devices, providing a high degree of portability, low compilation times, flexibility and extensibility. **AdaptiveCpp is the only major SYCL implementation that supports a single-pass compiler design, where the code is only parsed once for both host and target devices**. Support includes: - 1. NVIDIA CUDA GPUs through PTX; - 2. AMD ROCm GPUs through amdgcn code; - 3. Intel GPUs through SPIR-V (Level Zero); - 4.
SPIR-V compatible OpenCL devices supporting Intel USM extensions or fine-grained system SVM (such as Intel's OpenCL implementation for CPUs or GPUs); + 5. The host CPU through LLVM 2. Interoperability-focused multipass compilation flows. **AdaptiveCpp can aggregate existing clang toolchains and augment them with support for SYCL constructs**. This allows for a high degree of interoperability between SYCL and other models such as CUDA or HIP. For example, in this mode, the AdaptiveCpp CUDA and ROCm backends rely on the clang CUDA/HIP frontends that have been augmented by AdaptiveCpp to *additionally* also understand other models like SYCL. This means that the AdaptiveCpp compiler can not only compile SYCL code, but also CUDA/HIP code *even if they are mixed in the same source file*, making all CUDA/HIP features - such as the latest device intrinsics - also available from SYCL code ([details](hip-source-interop.md)). Additionally, vendor-optimized template libraries such as rocPRIM or CUB can also be used with AdaptiveCpp. This allows for highly optimized code paths in SYCL code for specific devices. Support includes: - 1. Any LLVM-supported CPU (including e.g. x86, arm, power etc) through the regular clang host toolchain with dedicated compiler transformation to accelerate SYCL constructs; - 2. NVIDIA CUDA GPUs through the clang CUDA toolchain; - 3. AMD ROCm GPUs through the clang HIP toolchain + 1. Any LLVM-supported CPU (including e.g. x86, arm, power etc) through the regular clang host toolchain with dedicated compiler transformation to accelerate SYCL constructs; + 2. NVIDIA CUDA GPUs through the clang CUDA toolchain; + 3. AMD ROCm GPUs through the clang HIP toolchain 3. Or **AdaptiveCpp can be used in library-only compilation flows**. In these compilation flows, AdaptiveCpp acts as a C++ library for third-party compilers. This can have portability advantages or simplify deployment. This includes support for: - 1. Any CPU supported by any OpenMP compilers; - 2. NVIDIA GPUs through CUDA and the NVIDIA nvc++ compiler, bringing NVIDIA vendor support and day 1 hardware support to the SYCL ecosystem + 1. Any CPU supported by any OpenMP compilers; + 2. NVIDIA GPUs through CUDA and the NVIDIA nvc++ compiler, bringing NVIDIA vendor support and day 1 hardware support to the SYCL ecosystem The following illustration shows the complete stack and its capabilities to target hardware: ![Compiler stack](img/stack.png) @@ -37,9 +37,7 @@ The generic SSCP flow can potentially provide very fast compile times, very good ### Implementation status -The SSCP flow is supported for all backends. - -Some features (e.g. SYCL 2020 reductions or group algorithms) are not yet implemented. +The SSCP flow is supported for all backends. The set of supported features is a strict superset of the features of other compilation flows. The only exception to this is the ability to mix-and-match SYCL with other backend-specific programming models. ### How it works @@ -68,14 +66,14 @@ AdaptiveCpp allows using backend-specific language extensions (e.g. CUDA/HIP C++ * If a backend runs on a compiler that provides a unified, single compilation pass for both host and device, backend-specific language extensions are always available. Currently this only affects the CUDA-nvc++ backend. * If the compiler relies on separate compilation passes for host and device: - * In device compilation passes, backend-specific language extensions are always available. 
- * In host compilation passes, the following applies: - * If the backend runs in integrated multipass mode, backend-specific language extensions are available. - * If the backend runs in explicit multipass mode: - * For SPIR-V, language extensions are always available - * For CUDA and HIP: Language extensions from *one* of them are available in the host pass. - * If one of them runs in integrated multipass and one in explicit multipass, language extensions from the one in integrated multipass are available - * If both are in explicit multipass, `acpp` will currently automatically pick one that will have language extensions enabled in the host pass. + * In device compilation passes, backend-specific language extensions are always available. + * In host compilation passes, the following applies: + * If the backend runs in integrated multipass mode, backend-specific language extensions are available. + * If the backend runs in explicit multipass mode: + * For SPIR-V, language extensions are always available + * For CUDA and HIP: Language extensions from *one* of them are available in the host pass. + * If one of them runs in integrated multipass and one in explicit multipass, language extensions from the one in integrated multipass are available + * If both are in explicit multipass, `acpp` will currently automatically pick one that will have language extensions enabled in the host pass. ## Summary of supported compilation targets @@ -115,7 +113,7 @@ approach is employed to achieve good performance and functional correctness (_Ka A deep dive into how the implementation works and why this approach was chosen can be found in Joachim Meyer's [master thesis](https://joameyer.de/hipsycl/Thesis_JoachimMeyer.pdf). -For more details, see the [installation instructions](installing.md) and the documentation [using AdaptiveCpp](using-hipsycl.md). +For more details, see the [installation instructions](installing.md) and the documentation [using AdaptiveCpp](using-acpp.md). ## acpp compilation driver diff --git a/doc/env_variables.md b/doc/env_variables.md index 714cc2e8c..924019591 100644 --- a/doc/env_variables.md +++ b/doc/env_variables.md @@ -31,4 +31,49 @@ * `ACPP_APPDB_DIR`: By default, AdaptiveCpp stores its application db (which in particular includes the per-app JIT cache) in `$HOME/.acpp`. This environment variable can be used to override the location. * `ACPP_JITOPT_IADS_RELATIVE_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): When the same argument has been passed into the kernel for this fraction of all invocations of the kernel, a new kernel will be JIT-compiled with the argument value hard-wired as constant. Not taken into account for the first application run. Default: 0.8. * `ACPP_JITOPT_IADS_RELATIVE_THRESHOLD_MIN_DATA`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): Only consider kernels with at least this many invocations for the relative threshold described above. Default: 1024. -* `ACPP_JITOPT_IADS_RELATIVE_EVICTION_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): If the relative frequency of a kernel argument value falls below this threshold, the statistics entry for the the argument value may be evicted if space for other values is needed.
\ No newline at end of file
+* `ACPP_JITOPT_IADS_RELATIVE_EVICTION_THRESHOLD`: JIT-time optimization *invariant argument detection & specialization* (active if `ACPP_ADAPTIVITY_LEVEL >= 2`): If the relative frequency of a kernel argument value falls below this threshold, the statistics entry for the argument value may be evicted if space for other values is needed.
+* `ACPP_ALLOCATION_TRACKING`: If set to 1, allows the AdaptiveCpp runtime to track and register the allocations that it manages. This enables additional JIT-time optimizations. Set to 0 to disable. (Default: 0)
+
+## Environment variables to control dumping IR during JIT compilation
+
+AdaptiveCpp can dump the IR of the code during stage 2 compilation (JIT compilation) at various stages in the processing and optimization pipeline.
+This feature only applies to the AdaptiveCpp generic JIT SSCP compiler (`--acpp-targets=generic`).
+
+It is primarily helpful for AdaptiveCpp developers for debugging or expert users who wish to understand how their input code is translated and processed in LLVM IR.
+
+These environment variables take the shape `ACPP_S2_DUMP_IR_<STAGENAME>` for various stages in the optimization process.
+* If the variable is set to `1`, the IR will be stored in `<STAGENAME>.ll`.
+* Otherwise, the content is interpreted as a filepath where the IR will be written to.
+
+Within one application run, AdaptiveCpp appends IR dumps to the dump file. When a new application run results in new dumps being generated to the same file, the file will be truncated first.
+
+Available stages for dumping:
+
+* `ACPP_S2_DUMP_IR_INPUT` - dumps the raw, unoptimized generic input LLVM IR
+* `ACPP_S2_DUMP_IR_INITIAL_OUTLINING` - After initial kernel outlining
+* `ACPP_S2_DUMP_IR_SPECIALIZATION` - After applying specializations to the kernel
+* `ACPP_S2_DUMP_IR_REFLECTION` - After processing JIT-time reflection queries
+* `ACPP_S2_DUMP_IR_JIT_OPTIMIZATIONS` - After processing optimizations that rely on JIT-time information
+* `ACPP_S2_DUMP_IR_BACKEND_FLAVORING` - After applying the "backend flavor", i.e. turning generic LLVM IR into IR that targets a specific backend.
+* `ACPP_S2_DUMP_IR_BUILTIN_REFLECTION` - After the second run of the JIT-time reflection pass; particularly affects reflection use inside AdaptiveCpp builtins.
+* `ACPP_S2_DUMP_IR_FULL_OPTIMIZATIONS` - After running the full LLVM optimization pipeline on the code.
+* `ACPP_S2_DUMP_IR_FINAL` - Final state of the LLVM IR before it is handed off to be lowered to backend-specific formats (e.g. PTX, amdgcn ISA, SPIR-V).
+* `ACPP_S2_DUMP_IR_ALL` - Dump all stages.
+
+
+A dump section for a stage in the dump file will take the following form:
+```
+;---------------- Begin AdaptiveCpp IR dump --------------
+; AdaptiveCpp SSCP S2 IR dump; Compiling kernels: (KERNELS), stage: (STAGENAME)
+
+(LLVM code here)
+;----------------- End AdaptiveCpp IR dump ---------------
+```
+`(STAGENAME)` refers to one of the stages listed above. `(KERNELS)` is an identifier that describes which kernels AdaptiveCpp is compiling in this IR. It contains the mangled function names of the kernels.
+
+In general, the dump file will contain multiple dump sections if dumping is enabled for multiple stages, or if multiple JIT compilations are triggered (e.g. multiple kernels are launched).
+
+If the `ACPP_S2_DUMP_IR_FILTER` filter is non-empty, AdaptiveCpp will only dump IR if the kernel identifier corresponds to the one specified in this variable.
+Note that this can still lead to multiple JIT compilation dumps, e.g. if AdaptiveCpp generates multiple specialized kernels based on runtime information for one C++ kernel.
+
+
diff --git a/doc/examples.md b/doc/examples.md index 79b64fe88..7c1c57ad8 100644 --- a/doc/examples.md +++ b/doc/examples.md @@ -3,41 +3,42 @@ The following code adds two vectors:
 #include <cassert>
 #include <iostream>
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 using data_type = float;
 
-std::vector<data_type> add(cl::sycl::queue& q,
+std::vector<data_type> add(sycl::queue& q,
                            const std::vector<data_type>& a,
-                           const std::vector<data_type>& b)
-{
+                           const std::vector<data_type>& b) {
   std::vector<data_type> c(a.size());
 
   assert(a.size() == b.size());
-  cl::sycl::range<1> work_items{a.size()};
-
-  {
-    cl::sycl::buffer<data_type> buff_a(a.data(), a.size());
-    cl::sycl::buffer<data_type> buff_b(b.data(), b.size());
-    cl::sycl::buffer<data_type> buff_c(c.data(), c.size());
-
-    q.submit([&](cl::sycl::handler& cgh){
-      auto access_a = buff_a.get_access<cl::sycl::access::mode::read>(cgh);
-      auto access_b = buff_b.get_access<cl::sycl::access::mode::read>(cgh);
-      auto access_c = buff_c.get_access<cl::sycl::access::mode::write>(cgh);
-
-      cgh.parallel_for(work_items,
-                       [=] (cl::sycl::id<1> tid) {
-        access_c[tid] = access_a[tid] + access_b[tid];
-      });
-    });
-  }
+
+  data_type* dev_a = sycl::malloc_device<data_type>(a.size(), q);
+  data_type* dev_b = sycl::malloc_device<data_type>(a.size(), q);
+  data_type* dev_c = sycl::malloc_device<data_type>(a.size(), q);
+
+  q.memcpy(dev_a, a.data(), sizeof(data_type) * a.size());
+  q.memcpy(dev_b, b.data(), sizeof(data_type) * b.size());
+  q.memcpy(dev_c, c.data(), sizeof(data_type) * c.size());
+
+  q.parallel_for(a.size(), [=](sycl::id<1> idx){
+    dev_c[idx] = dev_a[idx] + dev_b[idx];
+  });
+
+  q.memcpy(c.data(), dev_c, sizeof(data_type) * c.size());
+  q.wait();
+
+  sycl::free(dev_a, q);
+  sycl::free(dev_b, q);
+  sycl::free(dev_c, q);
+
   return c;
 }
 
 int main()
 {
-  cl::sycl::queue q;
+  sycl::queue q{sycl::property::queue::in_order{}};
   std::vector<data_type> a = {1.f, 2.f, 3.f, 4.f, 5.f};
   std::vector<data_type> b = {-1.f, 2.f, -3.f, 4.f, -5.f};
   auto result = add(q, a, b);
@@ -47,4 +48,4 @@ int main()
   std::cout << x << std::endl;
 }
-```
\ No newline at end of file
+```
diff --git a/doc/extensions.md b/doc/extensions.md index 455899b4a..726d952b8 100644 --- a/doc/extensions.md +++ b/doc/extensions.md @@ -4,6 +4,165 @@ AdaptiveCpp implements several extensions that are not defined by the specificat
+### `ACPP_EXT_RESTRICT_PTR`
+
+Provides a wrapper type that hints to the compiler that a pointer kernel argument does not alias other pointer arguments.
+*Note:* This currently only has an effect with AdaptiveCpp's generic JIT compiler (`--acpp-targets=generic`); other compilation flows ignore this hint.
+
+Example:
+
+```c++
+
+sycl::queue q;
+float* data = ...
+sycl::AdaptiveCpp_restrict_ptr<float> restrict_data = data;
+
+q.parallel_for(range, [=](auto idx){
+  // Converts implicitly to the underlying pointer type - float* in this
+  // example.
+  restrict_data[idx] *= 1.5f;
+});
+```
+
+### `ACPP_EXT_JIT_COMPILE_IF`
+
+Allows for specializing code based on target properties only known at JIT time. This is only supported with AdaptiveCpp's default generic JIT compiler (`--acpp-targets=generic`).
+If you also want to support other compilation flows, use of the following APIs must
+be guarded using `__acpp_if_target_sscp()`.
+
+#### Example
+```c++
+namespace jit = sycl::AdaptiveCpp_jit;
+
+__acpp_if_target_sscp(
+  jit::compile_if(
+    jit::reflect<jit::reflection_query::target_vendor_id>() ==
+        jit::vendor_id::nvidia,
+    [](){
+      // Will only be included in the JIT-compiled kernel if the target is NVIDIA hardware.
+      // The branching will be evaluated at JIT-time; there will be no runtime overhead
+      // in the generated kernel.
+      //
+      // As such, this mechanism can also be used to guard code that is unsupported or
+      // does not compile correctly on other hardware.
+    });
+);
+
+```
+
+#### API reference
+
+```c++
+namespace sycl::AdaptiveCpp_jit {
+
+/// JIT reflection API
+
+enum class compiler_backend : int {
+  spirv,
+  ptx,
+  amdgpu,
+  host
+};
+
+namespace vendor_id {
+
+// These vendor ids are provided for convenience since
+// they frequently occur; this list is non-exclusive; other
+// vendor_id values might be returned by JIT reflection APIs.
+inline constexpr int nvidia;
+inline constexpr int amd;
+inline constexpr int intel;
+}
+
+///
+/// This namespace defines properties that the JIT compiler can be queried for.
+namespace reflection_query {
+
+/// Vendor id of the target hardware
+/// Return type: int
+struct target_vendor_id;
+
+/// Returns a numeric identifier for the target architecture. For NVIDIA GPUs, this
+/// is the SM architecture (e.g. 86 for sm_86). For AMD GPUs, it is the amdgcn architecture
+/// as a hexadecimal number (e.g. 0x90c for gfx90c).
+/// For other hardware, the query currently returns 0.
+/// Return type: int
+struct target_arch;
+
+/// Returns whether the hardware has independent forward progress for each work item.
+/// Return type: bool
+struct target_has_independent_forward_progress;
+
+/// Returns whether the target is a CPU.
+/// Return type: bool
+struct target_is_cpu;
+
+/// Returns whether the target is a GPU.
+/// Return type: bool
+struct target_is_gpu;
+
+/// Returns the AdaptiveCpp runtime backend that is managing the execution of this kernel.
+/// Return type: int (sycl::backend cast to int)
+struct runtime_backend;
+
+/// Returns the AdaptiveCpp compiler backend that is generating code for this kernel.
+/// Return type: compiler_backend
+struct compiler_backend;
+}
+
+/// Evaluates at JIT-time the specified query. Query must be one of the types
+/// defined in AdaptiveCpp_jit::reflection_query.
+/// The compiler replaces calls to this function with the return value at JIT-time;
+/// calls to this function will not remain in the final generated code and will not cause runtime
+/// overhead.
+template <class Query>
+auto reflect();
+
+/// Evaluates at JIT-time whether the JIT reflection mechanism supports the specified query.
+/// Currently, all of the queries listed above are supported universally, but in the future
+/// queries might be added that are only supported for certain backends.
+///
+/// Query must be one of the types defined in AdaptiveCpp_jit::reflection_query.
+///
+/// The compiler replaces calls to this function with the return value at JIT-time;
+/// calls to this function will not remain in the final generated code and will not cause runtime
+/// overhead.
+template <class Query>
+bool knows();
+
+
+/// Code-generates the callable f only if condition evaluates to true at JIT time.
+///
+/// condition must evaluate to a value known at JIT time, either using compile-time
+/// values or return values from the JIT reflection API.
+///
+/// Because the condition is evaluated at JIT time, no runtime overhead
+/// will be present in the compiled kernel due to branching.
+///
+/// The signature of f is void().
+template <class F>
+void compile_if(bool condition, F&& f);
+
+/// Code-generates the callable if_branch only if condition evaluates to true at JIT time.
+/// Otherwise, the callable else_branch is code-generated.
+///
+/// condition must evaluate to a constant at JIT time, either using compile-time
+/// constants or return values from the JIT reflection API.
+///
+/// Because the condition is evaluated at JIT time, no runtime overhead
+/// will be present in the compiled kernel due to branching.
+///
+/// The signature of if_branch and else_branch is T() for arbitrary types T.
+///
+/// \return If T is not void, compile_if_else() returns the value returned by the
+/// user-provided callable that is invoked.
+template <class F, class G>
+auto compile_if_else(bool condition, F&& if_branch, G&& else_branch);
+
+}
+
+```
+
 ### `ACPP_EXT_DYNAMIC_FUNCTIONS` This extension allows users to provide functions used in kernels with definitions selected at runtime. We call such functions *dynamic functions*, since their definition will be determined at runtime using the JIT compiler. Once a kernel using dynamic functions has been JIT-compiled, there are no runtime overheads as dynamic functions are hardwired at JIT-time. @@ -33,7 +192,7 @@ int main() { sycl::queue q; // The dynamic_function_config object stores the JIT-time function mapping information. - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; // Requests calls to execute_operations to be replaced at JIT time // with {myfunction1(idx); myfunction2(idx);} dyn_function_config.define_as_call_sequence(&execute_operations, {&myfunction1, &myfunction2}); @@ -54,7 +213,7 @@ The AdaptiveCpp runtime maintains a kernel cache that automatically distinguishe * It is the user's responsibility to ensure that the `dynamic_function_config` object is kept alive at least until all kernels using it have completed. * `dynamic_function_config` is not thread-safe; if one object is shared across multiple threads, it is the user's responsibility to ensure appropriate synchronization. * With this extension, the user can exchange kernel code at runtime. This means that in general, the compiler cannot know at compile time anymore which parts of the code need to be part of device code. Therefore, functions providing the definitions have to be marked as `SYCL_EXTERNAL` to ensure that they are emitted to device code. This can be omitted if the function is invoked from the kernel already at compile time. -* It is possible to provide a "default definition" for dynamic functions by not just declaring them, but also providing a definition (e.g. in the example above, provide a definition for `execute_operations`). However, in this case, we recommend that the function is marked with `__attribute__((noinline))`.
Otherwise, in some cases the compiler might decide to already inline the function early on during the optimization process -- and once, inlined, the JIT compiler no loner sees the function and therefore can no longer find function calls to replace. The `noinline` attribute will have no performance implications once the replacement function definition has been put in place by the JIT compiler. Additionally, if the default function does not actually use the function arguments, the frontend might not actually emit function calls to the dynamic function. It is thus a good idea to use `sycl::jit::arguments_are_used()` to assert that these arguments might e.g. be used by a dynamic function replacement function. +* It is possible to provide a "default definition" for dynamic functions by not just declaring them, but also providing a definition (e.g. in the example above, provide a definition for `execute_operations`). However, in this case, we recommend that the function is marked with `__attribute__((noinline))`. Otherwise, in some cases the compiler might decide to already inline the function early on during the optimization process -- and once inlined, the JIT compiler no longer sees the function and therefore can no longer find function calls to replace. The `noinline` attribute will have no performance implications once the replacement function definition has been put in place by the JIT compiler. Additionally, if the default function does not actually use the function arguments, the frontend might not actually emit function calls to the dynamic function. It is thus a good idea to use `sycl::AdaptiveCpp_jit::arguments_are_used()` to assert that these arguments might e.g. be used by a dynamic function replacement function. With a default function definition, the example above might look like so: ```c++ @@ -70,7 +229,7 @@ __attribute__((noinline)) void execute_operations(int* data, sycl::item<1> idx) { // This prevents the compiler from removing calls to execute_operations if it // sees that the function cannot actually have any side-effects. - sycl::jit::arguments_are_used(data, idx); + sycl::AdaptiveCpp_jit::arguments_are_used(data, idx); } int main() { int* data = ...; // The dynamic_function_config object stores the JIT-time function mapping information. - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; // Requests calls to execute_operations to be replaced at JIT time // with {myfunction1(idx); myfunction2(idx);} // If this is removed, the regular function definition of execute_operations diff --git a/doc/index.md b/doc/index.md index 4dd1f7558..52ce492e5 100644 --- a/doc/index.md +++ b/doc/index.md @@ -1,7 +1,7 @@ # -![The Rust Logo](img/logo/logo-color.png) +![The AdaptiveCpp Logo](img/logo/logo-color.png) -Welcome to the documentation of the AdaptiveCpp ! +Welcome to the documentation of AdaptiveCpp!
@@ -9,22 +9,21 @@ Welcome to the documentation of the AdaptiveCpp ! --- - Configure and install AdaptiveCpp + Configure and install AdaptiveCpp. [:octicons-arrow-right-24: Installation](./installing.md) -- :fontawesome-solid-gears:{ .lg .middle } __Can run on any hardware__ +- :fontawesome-solid-gears:{ .lg .middle } __Can run on hardware from all major vendors__ --- - We support CPUs, GPUs, from all vendors, either through multipass compilation. - Or through our single pass SSCP compiler + We support CPUs and GPUs from all major vendors, either through multipass compilation or through our single-pass SSCP compiler. - [:octicons-arrow-right-24: Usage](./using-hipsycl.md) + [:octicons-arrow-right-24: Usage](./using-acpp.md)
!!! note - This documentation webpage is WIP. \ No newline at end of file + This documentation webpage is still work-in-progress. diff --git a/doc/install-ocl.md b/doc/install-ocl.md index 42e9d4656..e4ae24550 100644 --- a/doc/install-ocl.md +++ b/doc/install-ocl.md @@ -2,6 +2,9 @@ You will need an OpenCL implementation, and the OpenCL icd loader. The OpenCL library can be specified using `cmake -DOpenCL_LIBRARY=/path/to/libOpenCL.so`. +In order to generate correct code, AdaptiveCpp needs to apply a patch to the Khronos llvm-spirv translator. +You *must* have the `patch` command installed and available when running the AdaptiveCpp `cmake` configuration. If you have run `cmake` without the `patch` command available, please *clean your build directory* before trying again. + The OpenCL backend can be enabled using `cmake -DWITH_OPENCL_BACKEND=ON` when building AdaptiveCpp. In order to run code successfully on an OpenCL device, it must support SPIR-V ingestion and the Intel USM (unified shared memory) extension. In a degraded mode, devices supporting OpenCL fine-grained system SVM (shared virtual memory) may work as well. diff --git a/doc/installing.md b/doc/installing.md index 3aab66db8..80cc5c935 100644 --- a/doc/installing.md +++ b/doc/installing.md @@ -14,8 +14,8 @@ In order to successfully build and install AdaptiveCpp, the following dependenci * python 3 (for the `acpp` compiler driver) * `cmake` * the Boost C++ libraries (in particular `boost.fiber`, `boost.context` and for the unit tests `boost.test`) - * it may be helpful to set the `BOOST_ROOT` `cmake` variable to the path to the root directory of Boost you wish to use if `cmake` does not find it automatically - * **Note for boost 1.78 users:** There seems to be a bug in the build system for boost 1.78, causing the compiled fiber and context libraries not to be copied to the installation directory. You will have to copy these libraries manually to the installation directory. In binary packages from some distribution repositories this issue is fixed. You might be only affected when building boost manually from source. + * it may be helpful to set the `BOOST_ROOT` `cmake` variable to the path to the root directory of Boost you wish to use if `cmake` does not find it automatically + * **Note for boost 1.78 users:** There seems to be a bug in the build system for boost 1.78, causing the compiled fiber and context libraries not to be copied to the installation directory. You will have to copy these libraries manually to the installation directory. In binary packages from some distribution repositories this issue is fixed. You might be only affected when building boost manually from source. In addition, the various supported [compilation flows](compilation.md) and programming models have additional requirements: @@ -106,9 +106,9 @@ The default installation prefix is `/usr/local`. Change this to your liking. ###### General * `-DCMAKE_CXX_COMPILER` should be pointed to the C++ compiler to compile AdaptiveCpp with. Note that this also sets the default C++ compiler for the CPU backend when using acpp once AdaptiveCpp is installed. This can however also be modified later using `HIPSYCL_CPU_CXX`. * `-DACPP_COMPILER_FEATURE_PROFILE` can be used to configure the desired degree of compiler support. Supported values: - * `full` (default and recommended): Enables all AdaptiveCpp features, requires a compatible LLVM installation as described [here](install-llvm.md). This is recommended for both functionality and performance. 
- * `minimal`: Only enables the older interoperability-focused compilation flows for CUDA and HIP (`--acpp-targets=cuda` and `--acpp-targets=hip`). No OpenCL or Level Zero support, no C++ standard parallelism offloading support, no generic JIT compiler (`generic` target), no compiler acceleration for SYCL constructs on CPU device. **Should only be selected in specific circumstances.** - * `none`: Disables all compiler support and dependencies on LLVM. In addition to `minimal`, also disables the support for `--acpp-targets=cuda` and `--acpp-targets=hip`. In this mode, AdaptiveCpp operates purely as a library for third-party compilers. **Should only be selected in specific circumstances.** + * `full` (default and recommended): Enables all AdaptiveCpp features, requires a compatible LLVM installation as described [here](install-llvm.md). This is recommended for both functionality and performance. + * `minimal`: Only enables the older interoperability-focused compilation flows for CUDA and HIP (`--acpp-targets=cuda` and `--acpp-targets=hip`). No OpenCL or Level Zero support, no C++ standard parallelism offloading support, no generic JIT compiler (`generic` target), no compiler acceleration for SYCL constructs on CPU device. **Should only be selected in specific circumstances.** + * `none`: Disables all compiler support and dependencies on LLVM. In addition to `minimal`, also disables the support for `--acpp-targets=cuda` and `--acpp-targets=hip`. In this mode, AdaptiveCpp operates purely as a library for third-party compilers. **Should only be selected in specific circumstances.** ###### generic diff --git a/doc/macros.md b/doc/macros.md index 481ad60ad..3351002b1 100644 --- a/doc/macros.md +++ b/doc/macros.md @@ -68,3 +68,4 @@ Note: Some compiler drivers that AdaptiveCpp supports can compile for multiple b * `ACPP_STRICT_ACCESSOR_DEDUCTION` - define when building your SYCL implementation to enforce strict SYCL 2020 accessor type deduction rules. While this might be required for the correct compilation of certain SYCL code, it also disables parts of the AdaptiveCpp accessor variants performance optimization extension. As such, it can have a negative performance impact for code bound by register pressure. * `ACPP_ALLOW_INSTANT_SUBMISSION` - define to `1` before including `sycl.hpp` to allow submission of USM operations to in-order queues via the low-latency instant submission mechanism. Set to `0` to prevent the runtime from utilizing the instant submission mechanism. If C++ standard parallelism offloading is enabled, instant submissions are always allowed. * `ACPP_FORCE_INSTANT_SUBMISSION` - define to `1` before including `sycl.hpp` to imply `ACPP_ALLOW_INSTANT_SUBMISSION=1` and throw an exception when instant submission is not possible. +* `ACPP_NO_SHORT_NAMESPACE` - if defined, disables exposing AdaptiveCpp functionality in the `acpp` namespace. \ No newline at end of file diff --git a/doc/performance.md b/doc/performance.md index f02a12a2d..30c4fbb45 100644 --- a/doc/performance.md +++ b/doc/performance.md @@ -32,6 +32,41 @@ The other compilation flows `omp`, `cuda`, `hip` should mainly be used when *int * If you are unsure, the compilation flags used by clang-based compilers under the hood can be inspected using `<compiler> -###`. This also works for AdaptiveCpp's clang-based compilation flows. When in doubt, use this mechanism to align compilation flags between compilers. * The compiler invocation that `acpp` generates can be printed and its flags inspected with `--acpp-dryrun`.
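As a concrete illustration of the two inspection mechanisms in the bullets above (compiler name, flags and source file are placeholders):

```
# Print the underlying job invocations of a clang-based compiler:
clang++ -### -O3 -c app.cpp

# Print the compiler invocation that acpp would generate:
acpp --acpp-dryrun --acpp-targets=generic -O3 -c app.cpp
```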
+## SYCL memory management: USM vs buffers
+
+There are three kinds of unified shared memory (USM) in SYCL:
+
+* host USM (`sycl::malloc_host`). This is device-accessible host memory, similar to CUDA pinned memory. It is usually only situationally useful, e.g. when the memory is only rarely accessed on the GPU and a full data copy might be unnecessary.
+* device USM (`sycl::malloc_device`). This is device-resident memory that is inaccessible from the host or other devices. It always stays on that device, and is similar to CUDA's `cudaMalloc`. It provides very low overhead. Explicit data transfer mechanisms need to be invoked by the user to migrate data between host and device. Device USM is usually the most efficient memory for use on the device in SYCL.
+* shared USM (`sycl::malloc_shared`). This is memory that can automatically migrate between host and device, or potentially other devices, similar to e.g. CUDA's `cudaMallocManaged`.
+
+AdaptiveCpp supports all forms of USM universally on all backends and supported hardware.
+
+Additionally, SYCL provides the `sycl::buffer`/`sycl::accessor` model.
+
+Generally, SYCL buffers are inferior to USM when it comes to performance:
+* **All types of USM have significantly lower host-side runtime overhead compared to buffers**, and can substantially outperform buffers, especially for short-running kernels where submission latencies matter. This is especially true when in-order queues are used (see e.g. this paper for details: https://dl.acm.org/doi/fullHtml/10.1145/3648115.3648120).
+  * With USM, the programmer can express dependencies statically, while the SYCL buffer-accessor model must figure out dependencies at runtime. Similarly, USM allows the programmer to statically express allocation/deallocation and data transfers, while with buffers, non-trivial mechanisms in the SYCL runtime need to manage these operations automatically. This can add overhead.
+* Buffer accessors are not lightweight objects, and can increase register pressure in kernels compared to USM pointers.
+* Buffers may behave in unexpected ways that can silently introduce performance issues; for example, buffer destructors synchronize in certain cases to wait for work to complete.
+
+For shared USM specifically:
+* Performance of shared USM typically depends on memory access patterns and driver quality. Depending on the operating system and hardware, very good performance is also possible with shared USM.
+  * On CPU, shared USM is identical to device USM by design, and consequently there is no performance overhead.
+  * Performance on NVIDIA GPUs is typically excellent.
+  * On Intel GPUs, performance is typically good, depending on memory access patterns. On dedicated Intel GPUs, note that current hardware and drivers do not support data migration at page granularity, i.e. entire allocations are always migrated at once if data is accessed on host or device. This is not an issue on iGPUs.
+  * On AMD GPUs, performance can be good, but for some driver/OS/hardware setups it may be substantially degraded if the XNACK hardware feature is not available.
+* Performance of shared USM can often be improved using the `queue::prefetch()` performance hint.
+
+Shared USM is the most productive memory management model that SYCL has, and can be a great solution for e.g. rapid prototyping or porting CPU code.
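+To illustrate the difference between device USM and shared USM, here is a minimal sketch (queue setup, kernel and problem size are arbitrary placeholders):
+
+```c++
+#include <sycl/sycl.hpp>
+#include <vector>
+
+int main() {
+  sycl::queue q{sycl::property::queue::in_order{}};
+  constexpr std::size_t n = 1024;
+
+  // Device USM: maximum control, explicit data transfers
+  std::vector<float> host_data(n, 1.0f);
+  float *device_mem = sycl::malloc_device<float>(n, q);
+  q.copy(host_data.data(), device_mem, n);
+  q.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { device_mem[i] *= 2.0f; });
+  q.copy(device_mem, host_data.data(), n).wait();
+  sycl::free(device_mem, q);
+
+  // Shared USM: maximum productivity, automatic migration
+  float *shared_mem = sycl::malloc_shared<float>(n, q);
+  for (std::size_t i = 0; i < n; ++i) shared_mem[i] = 1.0f; // direct host access
+  q.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { shared_mem[i] *= 2.0f; });
+  q.wait(); // after synchronization, shared_mem is valid on the host again
+  sycl::free(shared_mem, q);
+}
+```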
+**Shared USM is also less verbose and more productive than using SYCL buffers!**
+
+In summary:
+* **When control and maximum performance are needed, use device USM (`sycl::malloc_device`).**
+* **When maximum productivity is needed, use shared USM (`sycl::malloc_shared`).**
+* **Never use buffers. They do not bring significant advantages compared to USM, but can introduce substantial drawbacks!**
+
 ## Ahead-of-time vs JIT compilation
 
 The compilation targets `omp`, `hip` and `cuda` perform ahead-of-time compilation. This means they depend strongly on the user to provide correct optimization flags when compiling.
@@ -54,6 +89,8 @@ This optimization process is complete when the following warning is no longer pr
 The extent of this can be controlled using the environment variable `ACPP_ADAPTIVITY_LEVEL`. A value of 0 disables the feature. The default is 1. Higher levels are expected to result in higher peak performance, but may require more application runs to converge to this performance. The default level of 1 usually guarantees peak performance for the second application run.
 
+Setting `ACPP_ALLOCATION_TRACKING=1` enables additional optimizations at adaptivity level 1.
+
 At adaptivity level >= 2, AdaptiveCpp will enable additional, aggressive optimizations. In particular, AdaptiveCpp will attempt to detect invariant kernel arguments, and hardwire those as constants during JIT time. In some cases, this can result in substantial performance increases. It is thus advisable to try setting `ACPP_ADAPTIVITY_LEVEL=2` and running the application a couple of times (typically 3-4 times).
@@ -61,6 +98,10 @@ Note: Applications that are highly latency-sensitive may notice a slightly incre
 **For peak performance, you should not disable adaptivity, and run the application until the warning above is no longer printed.**
 
+We recommend:
+* Experiment with `ACPP_ADAPTIVITY_LEVEL=1` and `ACPP_ADAPTIVITY_LEVEL=2`.
+* Experiment with `ACPP_ALLOCATION_TRACKING=1` and `ACPP_ALLOCATION_TRACKING=0`.
+
 *Note: Adaptivity levels higher than 2 are currently not implemented.*
 
 ### Empty the kernel cache when upgrading the stack
@@ -76,6 +117,7 @@ Clearing the cache can be accomplished by simply clearing the cache directory, e
 * When comparing CPU performance to icpx/DPC++, please note that DPC++ relies on either the Intel CPU OpenCL implementation or oneAPI construction kit to target CPUs. AdaptiveCpp can target CPUs either through OpenMP, or through OpenCL. In the latter case, it can use exactly the same OpenCL implementations that DPC++ uses for CPUs as well. So, if you notice that DPC++ performs better on CPU in some scenario, it might be a good idea to try the Intel OpenCL CPU implementation or the oneAPI construction kit with AdaptiveCpp! Drawing e.g. the conclusion that DPC++ is faster than AdaptiveCpp on CPU but only testing AdaptiveCpp's OpenMP backend is *not* correct reasoning!
 * When targeting the Intel OpenCL CPU implementation, you might also want to take into account [Intel's vectorizer tuning knobs](https://www.intel.com/content/www/us/en/docs/opencl-sdk/developer-guide-core-xeon/2018/vectorizer-knobs.html).
 * For the OpenMP backend, enable OpenMP thread pinning (e.g. `OMP_PROC_BIND=true`). AdaptiveCpp uses asynchronous worker threads for some light-weight tasks such as garbage collection, and these additional threads can interfere with kernel execution if OpenMP threads are not bound to cores.
+* In multi-socket systems or other systems with strong NUMA behavior, we recommend running one AdaptiveCpp process per socket (or NUMA domain) and using e.g. MPI to exchange data between the processes. This is because the SYCL data transfer functionality (`queue::memcpy` etc.) of the OpenMP backend is currently not NUMA-aware. If your code depends on fast data transfers, you might otherwise run into NUMA issues; if your data transfers are not performance-critical, this might not matter. Alternatively, on the CPU backend you can always use kernels to copy data, which is expected to deliver good performance (see the sketch after this list).
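+A minimal sketch of such a kernel-based copy (the function name and signature are illustrative, not an AdaptiveCpp API):
+
+```c++
+#include <sycl/sycl.hpp>
+
+// Copies n elements with a kernel instead of queue::memcpy. On the CPU
+// backend, the copy is then performed in parallel by the worker threads,
+// which can avoid funneling all traffic through a single NUMA domain.
+template <class T>
+sycl::event kernel_copy(sycl::queue &q, const T *src, T *dst, std::size_t n) {
+  return q.parallel_for(sycl::range<1>{n},
+                        [=](sycl::id<1> i) { dst[i] = src[i]; });
+}
+```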
 
 ### With omp.* compilation flow
 
 * When using `OMP_PROC_BIND`, there have been observations that performance suffers substantially, if AdaptiveCpp's OpenMP backend has been compiled against a different OpenMP implementation than the one used by `acpp` under the hood. For example, if `omp.accelerated` is used, `acpp` relies on clang and typically LLVM `libomp`, while the AdaptiveCpp runtime library may have been compiled with gcc and `libgomp`. The easiest way to resolve this is to appropriately use `cmake -DCMAKE_CXX_COMPILER=...` when building AdaptiveCpp to ensure that it is built using the same compiler. **If you observe substantial performance differences between AdaptiveCpp and native OpenMP, chances are your setup is broken.**
diff --git a/doc/stdpar.md b/doc/stdpar.md
index d422788da..d5bf3f6ef 100644
--- a/doc/stdpar.md
+++ b/doc/stdpar.md
@@ -42,7 +42,12 @@ Offloading is implemented for the following STL algorithms:
 |`any_of` | |
 |`all_of` | |
 |`none_of` | |
-
+|`merge` | |
+|`sort` | may not scale optimally for large problems |
+|`inclusive_scan` | |
+|`exclusive_scan` | |
+|`transform_inclusive_scan` | |
+|`transform_exclusive_scan` | |
 
 For all other execution policies or algorithms, the algorithm will compile and execute correctly, however the regular host implementation of the algorithm provided by the C++ standard library implementation will be invoked and no offloading takes place.
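+For example, in the following sketch (an arbitrary placeholder program), the `sort` and `inclusive_scan` invocations dispatch to the offloading implementations listed above when C++ standard parallelism offloading is enabled (e.g. when compiling with `acpp --acpp-stdpar`):
+
+```c++
+#include <algorithm>
+#include <execution>
+#include <numeric>
+#include <vector>
+
+int main() {
+  std::vector<int> data(1 << 20, 1);
+
+  // Offloaded when stdpar offloading is active; otherwise the regular
+  // host implementation of the standard library runs.
+  std::sort(std::execution::par_unseq, data.begin(), data.end());
+  std::inclusive_scan(std::execution::par_unseq, data.begin(), data.end(),
+                      data.begin());
+}
+```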
diff --git a/doc/using-hipsycl.md b/doc/using-acpp.md
similarity index 74%
rename from doc/using-hipsycl.md
rename to doc/using-acpp.md
index 14e1ac0f6..995d326ee 100644
--- a/doc/using-hipsycl.md
+++ b/doc/using-acpp.md
@@ -1,24 +1,25 @@
 # Using AdaptiveCpp in projects
 
-It is recommended to use the CMake integration for larger projects. See the section on the cmake integration below. Alternatively, `acpp` can be used directly as a compiler.
-## Using acpp
+It is recommended that the CMake integration be used for larger projects (see the section on the CMake integration below). Alternatively, `acpp` can be used directly as a compiler.
 
-`acpp` can be invoked like a regular compiler (e.g. `acpp -O3 -o test test.cpp`). It supports multiple compilation flows. In a typical installation (i.e. when AdaptiveCpp was built against LLVM >= 14 and the generic SSCP compiler was not explicitly disabled), it uses the `generic` compilation flow by default. This compilation flow usually compiles the fastest, produces the fastest binaries, and its generated binaries can run on all supported devices. **Unless you have have very specific needs, you probably should use the default `generic` compiler.**
+## Using `acpp`
 
-Advanced users or users with more specific needs may want to specify compilation flows explicitly.This is achieved with the `--acpp-targets="compilation-flow1:target1,target2,...;compilation-flow2:..."` command line argument, `ACPP_TARGETS` environment variable or cmake argument.
+`acpp` can be invoked like a regular compiler (e.g. `acpp -O3 -o test test.cpp`). It supports multiple compilation flows. A typical installation (i.e. when AdaptiveCpp was built against LLVM >= 14 and the generic SSCP compiler was not explicitly disabled) uses the `generic` compilation flow by default. This compilation flow usually compiles the quickest, produces the fastest binaries, and its generated binaries can run on all supported devices. **Unless you have very specific needs, you should probably use the default `generic` compiler.**
 
-**Other compilation flows like omp, cuda, hip are typically mostly interesting for backend interoperability use cases, not if performance is the top priority.**.
+Advanced users or users with more specific needs may want to specify compilation flows explicitly. This is achieved with the `--acpp-targets="compilation-flow1:target1,target2,...;compilation-flow2:..."` command line argument, the `ACPP_TARGETS` environment variable or the `ACPP_TARGETS` CMake variable.
+
+**Other compilation flows like `omp`, `cuda`, and `hip` are typically mostly attractive for backend interoperability use cases, not when performance is the primary concern.**
 
 ## AdaptiveCpp targets specification
 
-Both `acpp` and the cmake integration can be optionally provided with an AdaptiveCpp targets specification. This specification defines which compilation flows AdaptiveCpp should enable, and which devices from a compilation flow AdaptiveCpp should target during compilation. In general, it has the form:
+Both `acpp` and the CMake integration can optionally be provided with an AdaptiveCpp targets specification. This specification defines which compilation flows AdaptiveCpp should enable and which devices from a compilation flow AdaptiveCpp should target. In general, it has the form:
 ```
 "flow1:target1,target2,...;flow2:...;..."
 ```
-and can be passed either as `acpp` command line argument (`--acpp-targets=...`), environment variable (`ACPP_TARGETS=...`) or CMake argument (`-DACPP_TARGETS=...`) depending on whether `acpp` or `cmake` is used.
+and can be passed either as an `acpp` command line argument (`--acpp-targets=...`), environment variable (`ACPP_TARGETS=...`) or CMake argument (`-DACPP_TARGETS=...`) depending on whether `acpp` or `cmake` is used.
 
-"compilation flow" refers to one of the available compilation flows defined in the [compilation flow](compilation.md) documentation.
+"Compilation flow" refers to one of the available compilation flows defined in the [compilation documentation](compilation.md).
 
 ### Requirements for specifying targets of individual compilation flows
 
@@ -26,57 +27,61 @@ and can be passed either as `acpp` command line argument (`--acpp-targets=...`),
 Whether a compilation flow needs to be followed by a target list or not varies between the available flows and is described below.
 
 For the following compilation flows, targets cannot be specified:
+
 * `omp.*`
 * `generic`
 
 For the following compilation flows, targets can optionally be specified:
+
 * `cuda-nvcxx` - Targets take the format of `ccXY` where `XY` stands for the compute capability of the device.
For the following compilation flows, targets must be specified: -* `cuda.*` - The target format is defined by clang and takes the format of `sm_XY`. For example: - * `sm_52`: NVIDIA Maxwell GPUs - * `sm_60`: NVIDIA Pascal GPUs - * `sm_70`: NVIDIA Volta GPUs -* `hip.*` - The target format is defined by clang and takes the format of `gfxXYZ`. For example: - * `gfx900`: AMD Vega 10 GPUs (e.g. Radeon Vega 56, Vega 64) - * `gfx906`: AMD Vega 20 GPUs (e.g. Radeon VII, Instinct MI50) - * `gfx908`: AMD CDNA GPUs (e.g Instinct MI100) + +* `cuda.*` - The target format is defined by `clang` and takes the format of `sm_XY`. For example: + * `sm_52`: NVIDIA Maxwell GPUs (e.g. GeForce GTX 980, TITAN X) + * `sm_61`: NVIDIA Pascal GPUs (e.g. GeForce GTX 1080, TITAN Xp) + * `sm_70`: NVIDIA Volta GPUs (e.g. Tesla V100, TITAN V) +* `hip.*` - The target format is defined by `clang` and takes the format of `gfxXYZ`. For example: + * `gfx900`: AMD Vega 10 GPUs (e.g. Radeon Vega 56, Vega 64) + * `gfx906`: AMD Vega 20 GPUs (e.g. Radeon VII, Instinct MI50) + * `gfx908`: AMD CDNA GPUs (e.g. Instinct MI100) ### Abbreviations For some compilation flows, abbreviations exist that will be resolved by AdaptiveCpp to one of the available compilation flows: -* `omp` will be translated - * into `omp.accelerated` - * if AdaptiveCpp has been built with support for accelerated CPU and the host compiler is the clang that AdaptiveCpp has been built with or - * if `--acpp-use-accelerated-cpu` is set. If the accelerated CPU compilation flow is not available (e.g. AdaptiveCpp has been compiled without support for it), compilation will abort with an error. - * into `omp.library-only` otherwise + +* `omp` will be translated + * into `omp.accelerated` + * if AdaptiveCpp has been built with support for accelerated CPU and the host compiler is the `clang` that AdaptiveCpp has been built with or + * if `--acpp-use-accelerated-cpu` is set. If the accelerated CPU compilation flow is not available (e.g. AdaptiveCpp has been compiled without support for it), compilation will abort with an error. + * into `omp.library-only` otherwise. * `cuda` will be translated - * into `cuda.explicit-multipass` - * if another integrated multipass has been requested, or another backend that would conflict with `cuda.integrated-multipass`. AdaptiveCpp will emit a warning in this case, since switching to explicit multipass can change interoperability guarantees (see the [compilation](compilation.md) documentation). - * if `--acpp-explicit-multipass` is set explicitly - * into `cuda.integrated-multipass` otherwise -* `hip` will be translated into `hip.integrated-multipass` + * into `cuda.explicit-multipass` + * if another integrated multipass has been requested, or another backend that would conflict with `cuda.integrated-multipass`. AdaptiveCpp will emit a warning in this case, since switching to explicit multipass can change interoperability guarantees (see the [compilation documentation](compilation.md)). + * if `--acpp-explicit-multipass` is set explicitly. + * into `cuda.integrated-multipass` otherwise. +* `hip` will be translated into `hip.integrated-multipass`. Of course, the desired flows can also always be specified explicitly. ### Examples -* `generic` - creates a binary that can run on all backends. This also typically creates the fastest binaries. 
-* `"omp.library-only;cuda.explicit-multipass:sm_61;sm_70"` - compiles for the CPU backend and Pascal and Volta era GPUs -* `"omp;cuda:sm_70;hip:gfx906"` - compiles for the CPU backend (library or accelerated), NVIDIA Volta era GPUs via explicit multipass, AMD Vega 20 GPUs -* `"omp.accelerated;cuda:sm_70`" - compiles for the CPU backend (compiler accelerated) and NVIDIA Volta era GPUs. -* `"omp;cuda-nvcxx"` - compiles for the CPU backend and NVIDIA GPUs using nvc++ +* `"generic"` - creates a binary that can run on all backends. This also typically creates the fastest binaries. +* `"omp.library-only;cuda.explicit-multipass:sm_61;sm_70"` - compiles for the CPU backend and Pascal- and Volta-era GPUs. +* `"omp;cuda:sm_70;hip:gfx906"` - compiles for the CPU backend (library or accelerated), NVIDIA Volta-era GPUs via explicit multipass and AMD Vega 20 GPUs. +* `"omp.accelerated;cuda:sm_70"` - compiles for the CPU backend (compiler-accelerated) and NVIDIA Volta-era GPUs. +* `"omp;cuda-nvcxx"` - compiles for the CPU backend and NVIDIA GPUs using `nvc++`. ### Offloading C++ standard parallelism See [here](stdpar.md) for details on how to offload C++ standard STL algorithms using AdaptiveCpp. -## All the flags: acpp --help +## All the flags: `acpp --help` -The full excerpt from `acpp --help` follows below. Note the options can also be set via environment variables or corresponding CMake options. Default values can be set in the `/acpp/install/path/etc/AdaptiveCpp/*.json` files. +The full output obtained when running `acpp --help` is provided below. Note that the options can also be set via environment variables or the corresponding CMake options. Default values can be set in the `/acpp/install/path/etc/AdaptiveCpp/*.json` files. ``` -acpp [AdaptiveCpp compilation driver], Copyright (C) 2018-2023 Aksel Alpay and the AdaptiveCpp project - AdaptiveCpp version: 23.10.0+git.2d0c6b6f.20240226.branch.develop +acpp [AdaptiveCpp compilation driver], Copyright (C) 2018-2024 Aksel Alpay and the AdaptiveCpp project + AdaptiveCpp version: 24.06.0+git.8cf7a902.20241001.branch.develop Installation root: /install/path Plugin LLVM version: , can accelerate CPU: Available runtime backends: @@ -92,7 +97,7 @@ Options are: [can also be set with environment variable: ACPP_PLATFORM=] [default value provided by field 'default-platform' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].] [current value: NOT SET] - (deprecated) The platform that hipSYCL should target. Valid values: + (deprecated) The platform that AdaptiveCpp should target. Valid values: * cuda: Target NVIDIA CUDA GPUs * rocm: Target AMD GPUs running on the ROCm platform * cpu: Target only CPUs @@ -102,8 +107,8 @@ Options are: [default value provided by field 'default-clang' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].] [current value: NOT SET] The path to the clang executable that should be used for compilation - (Note: *must* be compatible with the clang version that the - hipSYCL clang plugin was compiled against!) + (Note: *must* be compatible with the clang version that the + AdaptiveCpp clang plugin was compiled against!) --acpp-nvcxx= [can also be set with environment variable: ACPP_NVCXX=] @@ -197,7 +202,7 @@ Options are: [default value provided by field 'default-config-file-dir' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].] [current value: NOT SET] Select an alternative path for the config files containing the default AdaptiveCpp settings. 
- It is normally not necessary for the user to change this setting. 
+ It is normally not necessary for the user to change this setting.
 
 --acpp-targets=
 [can also be set with environment variable: ACPP_TARGETS=]
@@ -211,11 +216,11 @@ Options are:
 Uses Boost.Fiber for nd_range parallel_for support.
 - omp.accelerated: Uses clang as host compiler to enable compiler support for nd_range parallel_for (see --acpp-use-accelerated-cpu).
- * cuda - CUDA backend 
+ * cuda - CUDA backend
 Requires specification of targets of the form sm_XY, e.g. sm_70 for Volta, sm_60 for Pascal
 Backend Flavors:
- - cuda.explicit-multipass: CUDA backend in explicit multipass mode 
+ - cuda.explicit-multipass: CUDA backend in explicit multipass mode
 (see --acpp-explicit-multipass)
 - cuda.integrated-multipass: Force CUDA backend to operate in integrated multipass mode.
@@ -259,8 +264,8 @@ Options are:
 [can also be set by setting environment variable ACPP_DRYRUN to any value other than false|off|0 ]
 [default value provided by field 'default-is-dryrun' in JSON files from directories: ['/install/path/etc/AdaptiveCpp'].]
 [current value: NOT SET]
- If set, only shows compilation commands that would be executed, 
- but does not actually execute it. 
+ If set, only shows compilation commands that would be executed,
+ but does not actually execute it.
 
 --acpp-explicit-multipass
 [can also be set by setting environment variable ACPP_EXPLICIT_MULTIPASS to any value other than false|off|0 ]
@@ -269,7 +274,7 @@ Options are:
 If set, executes device passes as separate compiler invocation and lets AdaptiveCpp control embedding device images into the host binary. This allows targeting multiple backends simultaneously that might otherwise be incompatible. In this mode, source code level interoperability may not be supported in the host pass.
- For example, you cannot use the CUDA kernel launch syntax[i.e. kernel <<< ... >>> (...)] in this mode. 
+ For example, you cannot use the CUDA kernel launch syntax[i.e. kernel <<< ... >>> (...)] in this mode.
 
 --acpp-save-temps
 [can also be set by setting environment variable ACPP_SAVE_TEMPS to any value other than false|off|0 ]
@@ -309,15 +314,14 @@ Options are:
 Any other options will be forwarded to the compiler.
 
 Note: Command line arguments take precedence over environment variables.
-
 ```
 
 ## Using the CMake integration
 
-Setting up a project using the AdaptiveCpp CMake integration is quite straight forward.
-The main points are adding `find_package(AdaptiveCpp REQUIRED)` and after defining the targets to build, adding `add_sycl_to_target(TARGET <target name>)` to have the compilation handled by the AdaptiveCpp toolchain.
-See the [example cmake project](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/examples/CMakeLists.txt).
-A typical configure command line looks like this: `cmake .. -DAdaptiveCpp_DIR=/acpp/install/dir/lib/cmake/AdaptiveCpp -DACPP_TARGETS="<targets>"`.
-`ACPP_TARGETS` has to be set either as environment variable or on the command line for the `find_package` call to succeed. See the documentation of this flag above.
+Setting up a project using the AdaptiveCpp CMake integration is fairly straightforward.
+The main points are adding `find_package(AdaptiveCpp REQUIRED)` and, after defining the targets to build, adding `add_sycl_to_target(TARGET <target name>)` to have the compilation handled by the AdaptiveCpp toolchain (see the [example CMake project](https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/examples/CMakeLists.txt)).
+
+A typical configure command might look like this: `cmake .. -DAdaptiveCpp_DIR=/acpp/install/dir/lib/cmake/AdaptiveCpp -DACPP_TARGETS="<targets>"`.
+`ACPP_TARGETS` has to be set either as an environment variable or through the command line for the `find_package` call to succeed. See the documentation of this flag above.
 
 If the accelerated CPU flow has been built, `-DACPP_USE_ACCELERATED_CPU=ON/OFF` can be used to override whether `omp` should refer to the `omp.library-only` or `omp.accelerated` compilation flow.
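+A minimal `CMakeLists.txt` sketch might look as follows (the project name, target name and source file are placeholders; the `add_sycl_to_target` signature follows the example projects in this repository):
+
+```cmake
+cmake_minimum_required(VERSION 3.10)
+project(my-sycl-app)
+
+set(CMAKE_CXX_STANDARD 17)
+
+# Pass -DAdaptiveCpp_DIR=... and -DACPP_TARGETS=... on the configure command line
+find_package(AdaptiveCpp REQUIRED)
+
+add_executable(my-app main.cpp)
+add_sycl_to_target(TARGET my-app SOURCES main.cpp)
+```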
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index c6de126bf..024d13cff 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,10 +1,10 @@
-cmake_minimum_required (VERSION 3.5)
+cmake_minimum_required(VERSION 3.10)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 
-project(opensycl-examples)
+project(adaptivecpp-examples)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/../cmake)
diff --git a/examples/bruteforce_nbody/CMakeLists.txt b/examples/bruteforce_nbody/CMakeLists.txt
index 48ef9d39b..c2aaa47cd 100644
--- a/examples/bruteforce_nbody/CMakeLists.txt
+++ b/examples/bruteforce_nbody/CMakeLists.txt
@@ -1,4 +1,4 @@
 add_executable(bruteforce_nbody bruteforce_nbody.cpp)
 add_sycl_to_target(TARGET bruteforce_nbody SOURCES bruteforce_nbody.cpp)
 install(TARGETS bruteforce_nbody
-        RUNTIME DESTINATION share/hipSYCL/examples/)
+        RUNTIME DESTINATION share/AdaptiveCpp/examples/)
diff --git a/examples/bruteforce_nbody/bruteforce_nbody.cpp b/examples/bruteforce_nbody/bruteforce_nbody.cpp
index 2a97e55a7..37755ac85 100644
--- a/examples/bruteforce_nbody/bruteforce_nbody.cpp
+++ b/examples/bruteforce_nbody/bruteforce_nbody.cpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include <chrono>
 
 #include "bruteforce_nbody.hpp"
 #include "model.hpp"
@@ -21,7 +22,7 @@ arithmetic_type mirror_position(const arithmetic_type mirror_pos,
                                 const arithmetic_type position)
 {
-  arithmetic_type delta = cl::sycl::fabs(mirror_pos - position);
+  arithmetic_type delta = sycl::fabs(mirror_pos - position);
 
   return (position <= mirror_pos) ?
mirror_pos + delta : mirror_pos - delta; } @@ -30,15 +31,10 @@ int get_num_iterations_per_output_step() { char* val = std::getenv("NBODY_ITERATIONS_PER_OUTPUT"); if(!val) - return 1; + return 10; return std::stoi(val); } -template -using local_accessor = - sycl::accessor; int main() { @@ -102,16 +98,15 @@ int main() velocities_cloud2.begin(), velocities_cloud2.end()); - auto particles_buffer = - sycl::buffer{particles.data(), particles.size()}; - auto velocities_buffer = - sycl::buffer{velocities.data(), velocities.size()}; - auto forces_buffer = - sycl::buffer{sycl::range<1>{particles.size()}}; - - sycl::default_selector selector; - sycl::queue q{selector}; + sycl::queue q{sycl::default_selector_v, sycl::property::queue::in_order{}}; + + particle_type* particles_buffer = sycl::malloc_device(particles.size(), q); + vector_type* velocities_buffer = sycl::malloc_device(velocities.size(), q); + vector_type* forces_buffer = sycl::malloc_device(particles.size(), q); + q.copy(particles.data(), particles_buffer, particles.size()); + q.copy(velocities.data(), velocities_buffer, particles.size()); + auto execution_range = sycl::nd_range<1>{ sycl::range<1>{((num_particles + local_size - 1) / local_size) * local_size}, sycl::range<1>{local_size} @@ -119,44 +114,51 @@ int main() std::ofstream outputfile{"output.txt"}; + + const std::size_t num_particles = particles.size(); + + auto start_time = std::chrono::high_resolution_clock::now(); + double total_time = 0.0; + for(std::size_t t = 0; t < num_timesteps; ++t) { // Submit force calculation q.submit([&](sycl::handler& cgh){ - auto particles_access = - particles_buffer.get_access(cgh); - auto forces_access = - forces_buffer.get_access(cgh); - auto scratch = local_accessor{ + auto scratch = sycl::local_accessor{ sycl::range<1>{local_size}, cgh }; - cgh.parallel_for(execution_range, - [=](sycl::nd_item<1> tid){ - const size_t global_id = tid.get_global_id().get(0); - const size_t local_id = tid.get_local_id().get(0); - const size_t num_particles = particles_access.get_range()[0]; + cgh.parallel_for(execution_range, + [=](sycl::nd_item<1> tid){ + const std::size_t global_id = tid.get_global_id().get(0); + const std::size_t local_id = tid.get_local_id().get(0); + vector_type force{0.0f}; const particle_type my_particle = - (global_id < num_particles) ? particles_access[global_id] : particle_type{0.0f}; + (global_id < num_particles) ? particles_buffer[global_id] : particle_type{0.0f}; for(size_t offset = 0; offset < num_particles; offset += local_size) { if(offset + local_id < num_particles) - scratch[local_id] = particles_access[offset + local_id]; + scratch[local_id] = particles_buffer[offset + local_id]; else scratch[local_id] = particle_type{0.0f}; - tid.barrier(); + + sycl::group_barrier(tid.get_group()); for(int i = 0; i < local_size; ++i) { const particle_type p = scratch[i]; const vector_type p_direction = p.swizzle<0,1,2>(); + // 3 flops const vector_type R = p_direction - my_particle.swizzle<0,1,2>(); - // dot is not yet implemented + + // 6 flops (ignoring rsqrt, where we cannot quantify - this + // will be a major source of the reported number being off + // from peak) const arithmetic_type r_inv = sycl::rsqrt(R.x()*R.x() + R.y()*R.y() + R.z()*R.z() + gravitational_softening); @@ -164,99 +166,113 @@ int main() // Actually we just calculate the acceleration, not the // force. We only need the acceleration anyway. 
if(global_id != offset + i) + // 9 flops force += static_cast(p.w()) * r_inv * r_inv * r_inv * R; } - tid.barrier(); + sycl::group_barrier(tid.get_group()); } if(global_id < num_particles) - forces_access[global_id] = force; + forces_buffer[global_id] = force; }); }); // Time integration - q.submit([&](cl::sycl::handler& cgh){ - auto particles_access = - particles_buffer.get_access(cgh); - auto velocities_access = - velocities_buffer.get_access(cgh); - auto forces_access = - forces_buffer.get_access(cgh); - const arithmetic_type dt = ::dt; - - cgh.parallel_for(execution_range, - [=](sycl::nd_item<1> tid){ - const size_t global_id = tid.get_global_id().get(0); - const size_t num_particles = particles_access.get_range().get(0); + q.parallel_for(execution_range, + [=](sycl::nd_item<1> tid){ + const size_t global_id = tid.get_global_id().get(0); - if(global_id < num_particles) - { - particle_type p = particles_access[global_id]; - vector_type v = velocities_access[global_id]; - const vector_type acceleration = forces_access[global_id]; - - // Bring v to the current state - v += acceleration * dt; + if(global_id < num_particles) + { + particle_type p = particles_buffer[global_id]; + vector_type v = velocities_buffer[global_id]; + const vector_type acceleration = forces_buffer[global_id]; - // Update position - p.x() += v.x() * dt; - p.y() += v.y() * dt; - p.z() += v.z() * dt; + // Bring v to the current state + v += acceleration * dt; - // Reflect particle position and invert velocities - // if particles exit the simulation cube - if(static_cast(p.x()) <= -half_cube_size) - { - v.x() = cl::sycl::fabs(v.x()); - p.x() = mirror_position(-half_cube_size, p.x()); - } - else if(static_cast(p.x()) >= half_cube_size) - { - v.x() = -cl::sycl::fabs(v.x()); - p.x() = mirror_position(half_cube_size, p.x()); - } + // Update position + p.x() += v.x() * dt; + p.y() += v.y() * dt; + p.z() += v.z() * dt; - if(static_cast(p.y()) <= -half_cube_size) - { - v.y() = cl::sycl::fabs(v.y()); - p.y() = mirror_position(-half_cube_size, p.y()); - } - else if(static_cast(p.y()) >= half_cube_size) - { - v.y() = -cl::sycl::fabs(v.y()); - p.y() = mirror_position(half_cube_size, p.y()); - } + // Reflect particle position and invert velocities + // if particles exit the simulation cube + if(static_cast(p.x()) <= -half_cube_size) + { + v.x() = sycl::fabs(v.x()); + p.x() = mirror_position(-half_cube_size, p.x()); + } + else if(static_cast(p.x()) >= half_cube_size) + { + v.x() = -sycl::fabs(v.x()); + p.x() = mirror_position(half_cube_size, p.x()); + } - if(static_cast(p.z()) <= -half_cube_size) - { - v.z() = cl::sycl::fabs(v.z()); - p.z() = mirror_position(-half_cube_size, p.z()); - } - else if(static_cast(p.z()) >= half_cube_size) - { - v.z() = -cl::sycl::fabs(v.z()); - p.z() = mirror_position(half_cube_size, p.z()); - } + if(static_cast(p.y()) <= -half_cube_size) + { + v.y() = sycl::fabs(v.y()); + p.y() = mirror_position(-half_cube_size, p.y()); + } + else if(static_cast(p.y()) >= half_cube_size) + { + v.y() = -sycl::fabs(v.y()); + p.y() = mirror_position(half_cube_size, p.y()); + } - particles_access[global_id] = p; - velocities_access[global_id] = v; + if(static_cast(p.z()) <= -half_cube_size) + { + v.z() = sycl::fabs(v.z()); + p.z() = mirror_position(-half_cube_size, p.z()); } - }); + else if(static_cast(p.z()) >= half_cube_size) + { + v.z() = -sycl::fabs(v.z()); + p.z() = mirror_position(half_cube_size, p.z()); + } + + particles_buffer[global_id] = p; + velocities_buffer[global_id] = v; + } }); + if(t % iterations_per_output 
== 0) { - std::cout << "Writing output..." << std::endl; - auto particle_positions = - particles_buffer.get_access(); + // This wait is only needed for the performance measurement. + // We don't need it for the algorithm itself - but we don't want + // to include the data transfer time in the measurement. + q.wait(); + auto stop_time = std::chrono::high_resolution_clock::now(); + total_time += + std::chrono::duration_cast(stop_time - start_time) + .count() * + 1.e-9; + + const std::size_t flops_per_iter = + 18 * num_particles * num_particles + 12 * num_particles; + std::cout << "Overall average performance: " + << 1.e-9 * flops_per_iter * (t + 1) / total_time << " GFlops" + << std::endl; + + q.copy(particles_buffer, particles.data(), particles.size()).wait(); + std::cout << "Writing output..." << std::endl; for(std::size_t i = 0; i < num_particles; ++i) { - outputfile << particle_positions[i].x() << " " - << particle_positions[i].y() << " " - << particle_positions[i].z() << " " << i << std::endl; + outputfile << particles[i].x() << " " + << particles[i].y() << " " + << particles[i].z() << " " << i << std::endl; } + + // start again for next iteration + start_time = std::chrono::high_resolution_clock::now(); } } + + q.wait(); + sycl::free(particles_buffer, q); + sycl::free(velocities_buffer, q); + sycl::free(forces_buffer, q); } diff --git a/examples/bruteforce_nbody/bruteforce_nbody.hpp b/examples/bruteforce_nbody/bruteforce_nbody.hpp index 1e119eb36..2ba6e6b79 100644 --- a/examples/bruteforce_nbody/bruteforce_nbody.hpp +++ b/examples/bruteforce_nbody/bruteforce_nbody.hpp @@ -12,8 +12,7 @@ #ifndef BRUTEFORCE_NBODY_HPP #define BRUTEFORCE_NBODY_HPP -#include -using namespace cl; +#include using arithmetic_type = float; using vector_type = sycl::vec; diff --git a/include/hipSYCL/algorithms/algorithm.hpp b/include/hipSYCL/algorithms/algorithm.hpp index dfd4a6dee..37798651e 100644 --- a/include/hipSYCL/algorithms/algorithm.hpp +++ b/include/hipSYCL/algorithms/algorithm.hpp @@ -15,15 +15,23 @@ #include #include #include +#include #include "hipSYCL/sycl/libkernel/accessor.hpp" #include "hipSYCL/sycl/libkernel/atomic_builtins.hpp" #include "hipSYCL/sycl/libkernel/memory.hpp" #include "hipSYCL/sycl/libkernel/functional.hpp" +#include "hipSYCL/sycl/detail/namespace_compat.hpp" #include "hipSYCL/sycl/event.hpp" #include "hipSYCL/sycl/queue.hpp" +#include "merge/merge.hpp" +#include "scan/scan.hpp" #include "util/traits.hpp" #include "hipSYCL/algorithms/util/allocation_cache.hpp" #include "hipSYCL/algorithms/util/memory_streaming.hpp" +#include "hipSYCL/algorithms/sort/bitonic_sort.hpp" +#include "hipSYCL/algorithms/merge/merge.hpp" +#include "hipSYCL/algorithms/scan/scan.hpp" + namespace hipsycl::algorithms { @@ -74,10 +82,11 @@ inline bool should_use_memset(const sycl::device& dev) { template sycl::event for_each(sycl::queue &q, ForwardIt first, ForwardIt last, - UnaryFunction2 f) { + UnaryFunction2 f, + const std::vector &deps = {}) { if(first == last) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, + return q.parallel_for(sycl::range{std::distance(first, last)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -85,15 +94,16 @@ sycl::event for_each(sycl::queue &q, ForwardIt first, ForwardIt last, }); } -template -sycl::event for_each_n(sycl::queue& q, - ForwardIt first, Size n, UnaryFunction2 f) { +template +sycl::event for_each_n(sycl::queue &q, ForwardIt first, Size n, + UnaryFunction2 f, + const std::vector &deps = {}) { if(n <= 0) 
// sycl::event{} represents a no-op that is always finished. // This means it does not respect prior tasks in the task graph! // TODO Is this okay? Can we defer this responsibility to the user? return sycl::event{}; - return q.parallel_for(sycl::range{static_cast(n)}, + return q.parallel_for(sycl::range{static_cast(n)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -102,12 +112,12 @@ sycl::event for_each_n(sycl::queue& q, } template -sycl::event transform(sycl::queue& q, - ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 d_first, - UnaryOperation unary_op) { +sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 d_first, UnaryOperation unary_op, + const std::vector &deps = {}) { if(first1 == last1) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first1, last1)}, + return q.parallel_for(sycl::range{std::distance(first1, last1)}, deps, [=](sycl::id<1> id) { auto input = first1; auto output = d_first; @@ -121,10 +131,11 @@ template sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, ForwardIt3 d_first, - BinaryOperation binary_op) { + BinaryOperation binary_op, + const std::vector &deps = {}) { if(first1 == last1) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first1, last1)}, + return q.parallel_for(sycl::range{std::distance(first1, last1)}, deps, [=](sycl::id<1> id) { auto input1 = first1; auto input2 = first2; @@ -138,7 +149,7 @@ sycl::event transform(sycl::queue &q, ForwardIt1 first1, ForwardIt1 last1, template sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, - ForwardIt2 d_first) { + ForwardIt2 d_first, const std::vector &deps = {}) { auto size = std::distance(first, last); if(size == 0) @@ -151,9 +162,9 @@ sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, std::is_same_v && util::is_contiguous() && util::is_contiguous() && detail::should_use_memcpy(q.get_device())) { - return q.memcpy(&(*d_first), &(*first), size * sizeof(value_type1)); + return q.memcpy(&(*d_first), &(*first), size * sizeof(value_type1), deps); } else { - return q.parallel_for(sycl::range{size}, + return q.parallel_for(sycl::range{size}, deps, [=](sycl::id<1> id) { auto input = first; auto output = d_first; @@ -164,39 +175,88 @@ sycl::event copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, } } - -template -sycl::event copy_if(sycl::queue& q, - ForwardIt1 first, ForwardIt1 last, - ForwardIt2 d_first, - UnaryPredicate pred) { - if(first == last) +template +sycl::event copy_if(sycl::queue &q, util::allocation_group &scratch_allocations, + ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + UnaryPredicate pred, + std::size_t *num_elements_copied = nullptr, + const std::vector &deps = {}) { + if(first == last) { + if(num_elements_copied) + *num_elements_copied = 0; return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, - [=](sycl::id<1> id) { - auto input = first; - auto output = d_first; - std::advance(input, id[0]); - std::advance(output, id[0]); - auto input_v = *input; - if(pred(input_v)) - *output = input_v; - }); + } + + // TODO: We could optimize by switching between 32/64 bit types + // depending on problem size + using ScanT = std::size_t; + + auto generator = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size) { + if(effective_global_id >= problem_size) + return ScanT{0}; + + ForwardIt1 it = first; + std::advance(it, effective_global_id); + if(pred(*it)) 
+ return ScanT{1}; + + return ScanT{0}; + }; + + auto result_processor = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size, + auto value) { + if (effective_global_id < problem_size) { + ForwardIt2 output = d_first; + ForwardIt1 input = first; + std::advance(input, effective_global_id); + std::advance(output, value); + + bool needs_copy = false; + + if(effective_global_id < problem_size) { + auto input_value = *input; + needs_copy = pred(input_value); + if(needs_copy) + *output = *input; + } + + if (effective_global_id == problem_size - 1 && num_elements_copied) { + ScanT inclusive_scan_result = value; + // We did an exclusive scan, so if the last element also was copied, + // we need to add that. + if(needs_copy) + ++inclusive_scan_result; + + *num_elements_copied = static_cast(inclusive_scan_result); + } + } + }; + + std::size_t problem_size = std::distance(first, last); + + constexpr bool is_inclusive_scan = false; + return scanning::generate_scan_process( + q, scratch_allocations, problem_size, sycl::plus<>{}, + ScanT{0}, generator, result_processor, deps); } -template -sycl::event copy_n(sycl::queue& q, ForwardIt1 first, Size count, ForwardIt2 result) { +template +sycl::event copy_n(sycl::queue &q, ForwardIt1 first, Size count, + ForwardIt2 result, + const std::vector &deps = {}) { if(count <= 0) return sycl::event{}; auto last = first; std::advance(last, count); - return copy(q, first, last, result); + return copy(q, first, last, result, deps); } template sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, - const T &value) { + const T &value, const std::vector &deps = {}) { auto size = std::distance(first, last); if(size == 0) return sycl::event{}; @@ -204,7 +264,7 @@ sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, using value_type = typename std::iterator_traits::value_type; auto invoke_kernel = [&]() -> sycl::event{ - return q.parallel_for(sycl::range{size}, + return q.parallel_for(sycl::range{size}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -219,7 +279,7 @@ sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, if (detail::all_bytes_equal(value, equal_byte) && detail::should_use_memset(q.get_device())) { return q.memset(&(*first), static_cast(equal_byte), - size * sizeof(T)); + size * sizeof(T), deps); } else { return invoke_kernel(); } @@ -230,21 +290,22 @@ sycl::event fill(sycl::queue &q, ForwardIt first, ForwardIt last, template sycl::event fill_n(sycl::queue& q, - ForwardIt first, Size count, const T& value ) { + ForwardIt first, Size count, const T& value, + const std::vector &deps = {}) { if(count <= Size{0}) return sycl::event{}; auto last = first; std::advance(last, count); - return fill(q, first, last, value); + return fill(q, first, last, value, deps); } - -template -sycl::event generate(sycl::queue& q, ForwardIt first, ForwardIt last, Generator g) { +template +sycl::event generate(sycl::queue &q, ForwardIt first, ForwardIt last, + Generator g, const std::vector &deps = {}) { if(first == last) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, + return q.parallel_for(sycl::range{std::distance(first, last)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -252,12 +313,12 @@ sycl::event generate(sycl::queue& q, ForwardIt first, ForwardIt last, Generator }); } -template -sycl::event generate_n(sycl::queue& q, ForwardIt first, - Size count, Generator g) { +template +sycl::event generate_n(sycl::queue &q, 
ForwardIt first, Size count, Generator g, + const std::vector &deps = {}) { if(count <= 0) return sycl::event{}; - return q.parallel_for(sycl::range{static_cast(count)}, + return q.parallel_for(sycl::range{static_cast(count)}, deps, [=](sycl::id<1> id) { auto it = first; std::advance(it, id[0]); @@ -265,37 +326,38 @@ sycl::event generate_n(sycl::queue& q, ForwardIt first, }); } -template -sycl::event replace(sycl::queue& q, ForwardIt first, ForwardIt last, - const T& old_value, const T& new_value) { +template +sycl::event replace(sycl::queue &q, ForwardIt first, ForwardIt last, + const T &old_value, const T &new_value, + const std::vector &deps = {}) { if(first == last) return sycl::event{}; - return for_each(q, first, last, [=](auto& x){ + return for_each(q, first, last,[=](auto& x){ if(x == old_value) x = new_value; - }); + }, deps); } -template -sycl::event replace_if(sycl::queue& q, ForwardIt first, ForwardIt last, - UnaryPredicate p, const T& new_value) { +template +sycl::event replace_if(sycl::queue &q, ForwardIt first, ForwardIt last, + UnaryPredicate p, const T &new_value, + const std::vector &deps = {}) { if(first == last) return sycl::event{}; return for_each(q, first, last, [=](auto& x){ if(p(x)) x = new_value; - }); + }, deps); } -template -sycl::event replace_copy_if( - sycl::queue& q, ForwardIt1 first, - ForwardIt1 last, ForwardIt2 d_first, UnaryPredicate p, const T &new_value) { +template +sycl::event replace_copy_if(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, + ForwardIt2 d_first, UnaryPredicate p, + const T &new_value, + const std::vector &deps = {}) { if (first == last) return sycl::event{}; - return q.parallel_for(sycl::range{std::distance(first, last)}, + return q.parallel_for(sycl::range{std::distance(first, last)}, deps, [=](sycl::id<1> id) { auto input = first; auto output = d_first; @@ -310,27 +372,27 @@ sycl::event replace_copy_if( } template -sycl::event -replace_copy(sycl::queue& q, - ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, - const T &old_value, const T &new_value) { +sycl::event replace_copy(sycl::queue &q, ForwardIt1 first, ForwardIt1 last, + ForwardIt2 d_first, const T &old_value, + const T &new_value, + const std::vector &deps = {}) { if (first == last) return sycl::event{}; return replace_copy_if( q, first, last, d_first, [=](const auto &x) { return x == old_value; }, - new_value); + new_value, deps); } // Need transform_reduce functionality for find etc, so forward // declare here. -template sycl::event transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt first, ForwardIt last, T* out, T init, - BinaryReductionOp reduce, UnaryTransformOp transform); + BinaryReductionOp reduce, UnaryTransformOp transform, + const std::vector &deps); -/* // Need transform_reduce functionality for find etc, so forward // declare here. 
template sycl::event early_exit_for_each(sycl::queue &q, std::size_t problem_size, early_exit_flag_t *output_has_exited_early, - Predicate should_exit) { + Predicate should_exit, + const std::vector &deps = {}) { std::size_t group_size = 128; @@ -398,7 +461,7 @@ sycl::event early_exit_for_each(sycl::queue &q, std::size_t problem_size, }); }; - auto evt = q.single_task([=](){*output_has_exited_early = false;}); + auto evt = q.single_task(deps, [=](){*output_has_exited_early = false;}); return q.parallel_for(sycl::nd_range<1>{dispatched_global_size, group_size}, evt, kernel); } @@ -408,7 +471,7 @@ sycl::event early_exit_for_each(sycl::queue &q, std::size_t problem_size, template sycl::event all_of(sycl::queue &q, ForwardIt first, ForwardIt last, detail::early_exit_flag_t* out, - UnaryPredicate p) { + UnaryPredicate p, const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; @@ -417,7 +480,7 @@ sycl::event all_of(sycl::queue &q, auto it = first; std::advance(it, idx[0]); return !p(*it); - }); + }, deps); return q.single_task(evt, [=](){ *out = static_cast(!(*out)); }); @@ -426,7 +489,7 @@ sycl::event all_of(sycl::queue &q, template sycl::event any_of(sycl::queue &q, ForwardIt first, ForwardIt last, detail::early_exit_flag_t* out, - UnaryPredicate p) { + UnaryPredicate p, const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; @@ -435,23 +498,64 @@ sycl::event any_of(sycl::queue &q, auto it = first; std::advance(it, idx[0]); return p(*it); - }); + }, deps); } template sycl::event none_of(sycl::queue &q, ForwardIt first, ForwardIt last, detail::early_exit_flag_t* out, - UnaryPredicate p) { + UnaryPredicate p, const std::vector& deps = {}) { std::size_t problem_size = std::distance(first, last); if(problem_size == 0) return sycl::event{}; - auto evt = any_of(q, first, last, out, p); + auto evt = any_of(q, first, last, out, p, deps); return q.single_task(evt, [=](){ *out = static_cast(!(*out)); }); } +template +sycl::event sort(sycl::queue &q, RandomIt first, RandomIt last, + Compare comp = std::less<>{}, + const std::vector& deps = {}) { + std::size_t problem_size = std::distance(first, last); + if(problem_size == 0) + return sycl::event{}; + + return sorting::bitonic_sort(q, first, last, comp, deps); +} + +template< class ForwardIt1, class ForwardIt2, + class ForwardIt3, class Compare > +sycl::event merge(sycl::queue& q, + util::allocation_group &scratch_allocations, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first, Compare comp = std::less<>{}, + const std::vector& deps = {}) { + + std::size_t size1 = std::distance(first1, last1); + std::size_t size2 = std::distance(first2, last2); + + if(size1 == 0) + return copy(q, first2, last2, d_first); + if(size2 == 0) + return copy(q, first1, last1, d_first); + + std::size_t problem_size = size1 + size2; + if(problem_size == 0) + return sycl::event{}; + + if (q.get_device().get_backend() == sycl::backend::omp) + return merging::segmented_merge(q, first1, last1, first2, last2, d_first, + comp, 128, deps); + else + return merging::hierarchical_hybrid_merge(q, scratch_allocations, first1, + last1, first2, last2, d_first, + comp, 128, deps); +} + } #endif diff --git a/include/hipSYCL/algorithms/binary_search/index_search.hpp b/include/hipSYCL/algorithms/binary_search/index_search.hpp new file mode 100644 index 000000000..701278f73 --- /dev/null +++ 
b/include/hipSYCL/algorithms/binary_search/index_search.hpp
@@ -0,0 +1,74 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#ifndef ACPP_ALGORITHMS_INDEX_SEARCH_HPP
+#define ACPP_ALGORITHMS_INDEX_SEARCH_HPP
+
+#include <type_traits>
+
+namespace hipsycl::algorithms::binary_searching {
+
+// Same as std::lower_bound, but works in terms of indices
+template< class IndexT, class T, class DataGetter,
+          class Compare >
+constexpr IndexT index_lower_bound( IndexT first, IndexT last,
+                                    const T& value, DataGetter load, Compare comp ) {
+  using SignedIndexT = typename std::make_signed<IndexT>::type;
+
+  IndexT current;
+  SignedIndexT count, step;
+  count = last - first;
+
+  while (count > 0) {
+    current = first;
+    step = count / 2;
+    current += step;
+
+    if (comp(load(current), value)) {
+      first = ++current;
+      count -= step + 1;
+    } else
+      count = step;
+  }
+
+  return first;
+}
+
+
+// Same as std::upper_bound, but works in terms of indices
+template< class IndexT, class T, class DataGetter,
+          class Compare >
+constexpr IndexT index_upper_bound( IndexT first, IndexT last,
+                                    const T& value, DataGetter load, Compare comp ) {
+  using SignedIndexT = typename std::make_signed<IndexT>::type;
+
+  IndexT current;
+  SignedIndexT count, step;
+  count = last - first;
+
+  while (count > 0) {
+    current = first;
+    step = count / 2;
+    current += step;
+
+    if (!comp(value, load(current))) {
+      first = ++current;
+      count -= step + 1;
+    } else
+      count = step;
+  }
+
+  return first;
+}
+
+}
+
+#endif
diff --git a/include/hipSYCL/algorithms/merge/merge.hpp b/include/hipSYCL/algorithms/merge/merge.hpp
new file mode 100644
index 000000000..76652f77d
--- /dev/null
+++ b/include/hipSYCL/algorithms/merge/merge.hpp
@@ -0,0 +1,313 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_MERGE_HPP +#define ACPP_ALGORITHMS_MERGE_HPP + +#include +#include +#include + +#include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/sycl/libkernel/nd_item.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +#include "../sort/bitonic_sort.hpp" +#include "merge_path.hpp" + +namespace hipsycl::algorithms::merging { + +namespace detail { + +template +void sequential_merge(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2, OutputIt out, Compare comp, Size max_num_merged) { + + auto initial_out = out; + auto copy_remaining = [&](auto first, auto last) { + for (; first != last && (std::distance(initial_out, out) < max_num_merged); + ++first, ++out) + *out = *first; + }; + + for (; first1 != last1 && (std::distance(initial_out, out) < max_num_merged); + ++out) { + if(first2 == last2) { + copy_remaining(first1, last1); + return; + } else { + auto f1 = *first1; + auto f2 = *first2; + if(comp(f1, f2)) { + *out = f1; + ++first1; + } else { + *out = f2; + ++first2; + } + } + } + copy_remaining(first2, last2); +} + + +/// Decomposes the problem into N independent merges of given size, and +/// then runs sequential merge on them. This might be a good strategy on CPU. +/// +/// Precondition: distance(fist1, last1) > 0 && distance(first2, last2) > 0. +/// Otherwise we cannot run the merge path algorithm for decomposing the merge. +template +void segmented_merge( + RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, RandomIt2 last2, + OutputIt out, Compare comp, std::size_t segment_index, + std::size_t segment_chunk_size) { + + std::size_t p1 = 0; + std::size_t p2 = 0; + + merge_path::nth_independent_merge_begin(first1, last1, first2, last2, comp, + segment_index, + segment_chunk_size, p1, p2); + + auto chunk_first1 = first1; + auto chunk_first2 = first2; + + std::advance(chunk_first1, p1); + std::advance(chunk_first2, p2); + + auto chunk_last1 = chunk_first1; + auto chunk_last2 = chunk_first2; + + std::advance(chunk_last1, std::min(segment_chunk_size, + std::distance(first1, last1) - p1)); + std::advance(chunk_last2, std::min(segment_chunk_size, + std::distance(first2, last2) - p2)); + + std::size_t chunk_out_offset = segment_index * segment_chunk_size; + auto chunk_out = out; + std::advance(chunk_out, chunk_out_offset); + + sequential_merge(chunk_first1, chunk_last1, chunk_first2, chunk_last2, + chunk_out, comp, segment_chunk_size); +} + +template +void store_segment_begin(RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, + RandomIt2 last2, Compare comp, + IndexT segment_index, + IndexT segment_size, + IndexT* first_out1, IndexT* first_out2, + std::size_t offset = 0 // Will be added to result + ) { + + auto problem_size1 = std::distance(first1, last1); + auto problem_size2 = std::distance(first2, last2); + + if(problem_size1 == 0) { + first_out1[segment_index] = offset + 0; + first_out2[segment_index] = offset + segment_index * segment_size; + } else if(problem_size2 == 0) { + first_out2[segment_index] = offset + 0; + first_out1[segment_index] = offset + segment_index * segment_size; + } else { + + IndexT p1 = 0; + IndexT p2 = 0; + + merge_path::nth_independent_merge_begin(first1, last1, first2, last2, comp, + segment_index, + segment_size, p1, p2); + + first_out1[segment_index] = p1 + offset; + first_out2[segment_index] = p2 + offset; + } +} + +template +void segment_merge_by_group_sort( + Group grp, // SYCL group object. 
Group size must correspond to segment size, + // grp id must correspond to segment index. + RandomIt1 first1, // Iterators describing the *whole* merge range, not just + // this group + RandomIt1 last1, RandomIt2 first2, RandomIt2 last2, OutputIt out, + Compare comp, IndexT *segments_begin1, IndexT *segments_begin2, + IndexT num_segments, + typename std::iterator_traits::value_type *local_mem = nullptr) { + + int lid = grp.get_local_linear_id(); + auto grp_id = grp.get_group_linear_id(); + int grp_size = grp.get_local_linear_range(); + + std::size_t segment_begin1 = segments_begin1[grp_id]; + std::size_t segment_begin2 = segments_begin2[grp_id]; + + RandomIt1 group_first1 = first1; + std::advance(group_first1, segment_begin1); + RandomIt2 group_first2 = first2; + std::advance(group_first2, segment_begin2); + + std::size_t segment_end1 = + std::distance(group_first1, last1) + segment_begin1; + std::size_t segment_end2 = + std::distance(group_first2, last2) + segment_begin2; + if(grp_id < num_segments - 1) { + segment_end1 = segments_begin1[grp_id + 1]; + segment_end2 = segments_begin2[grp_id + 1]; + } + + RandomIt1 group_last1 = first1; + std::advance(group_last1, segment_end1); + + RandomIt2 group_last2 = first2; + std::advance(group_last2, segment_end2); + + OutputIt group_out = out; + std::advance(group_out, grp_id * grp_size); + + int input_size1 = std::distance(group_first1, group_last1); + int input_size2 = std::distance(group_first2, group_last2); + auto local_problem_size = input_size1 + input_size2; + + auto load = [](auto it, auto idx) { + std::advance(it, idx); + return *it; + }; + + auto store = [](auto it, auto idx, auto v) { + std::advance(it, idx); + *it = v; + }; + + auto barrier = [&](){ + sycl::group_barrier(grp); + }; + + if(local_mem) { + if(lid < input_size1) + local_mem[lid] = load(group_first1, lid); + if(lid < input_size2) + local_mem[lid + input_size1] = load(group_first2, lid); + + barrier(); + sorting::bitonic_group_sort(local_mem, grp_size, local_problem_size, + lid, barrier, comp); + if(lid < local_problem_size) + store(group_out, lid, local_mem[lid]); + } else { + if(lid < input_size1) + store(group_out, lid, load(group_first1, lid)); + if(lid < input_size2) + store(group_out, lid + input_size1, load(group_first2, lid)); + + barrier(); + sorting::bitonic_group_sort(group_out, grp_size, local_problem_size, + lid, barrier, comp); + } +} +} + +/// Precondition: distance(fist1, last1) > 0 && distance(first2, last2) > 0. +/// Otherwise we cannot run the merge path algorithm for decomposing the merge. 
+template +sycl::event segmented_merge(sycl::queue &q, RandomIt1 first1, RandomIt1 last1, + RandomIt2 first2, RandomIt2 last2, OutputIt out, + Compare comp, + std::size_t segment_chunk_size = 128, + const std::vector &deps = {}) { + + //detail::print_merge_matrix(first1, last1, first2, last2, comp); + + std::size_t problem_size = merge_path::num_independent_merges( + first1, last1, first2, last2, segment_chunk_size); + + if(problem_size == 0) + return sycl::event{}; + + return q.parallel_for(sycl::range{problem_size}, deps, [=](sycl::id<1> idx) { + detail::segmented_merge(first1, last1, first2, last2, out, comp, idx.get(0), + segment_chunk_size); + }); +} + +// Assumes that distance(first1,last1)!=0 && distance(first2,last2)!=0 +template +sycl::event +hierarchical_hybrid_merge(sycl::queue &q, util::allocation_group &scratch, + RandomIt1 first1, RandomIt1 last1, RandomIt2 first2, + RandomIt2 last2, OutputIt out, Compare comp, + std::size_t segment_chunk_size = 128, + const std::vector &deps = {}) { + + //detail::print_merge_matrix(first1, last1, first2, last2, comp); + + std::size_t num_merges = merge_path::num_independent_merges( + first1, last1, first2, last2, segment_chunk_size); + std::size_t* segment_start_scratch = scratch.obtain(2 * num_merges); + + std::size_t* segment_start_scratch1 = segment_start_scratch; + std::size_t* segment_start_scratch2 = segment_start_scratch + num_merges; + + if(num_merges == 0) + return sycl::event{}; + + sycl::event store_segment_begin_evt = + q.parallel_for(sycl::range{num_merges}, deps, [=](sycl::id<1> idx) { + detail::store_segment_begin( + first1, last1, first2, last2, comp, idx.get(0), segment_chunk_size, + segment_start_scratch1, segment_start_scratch2); + }); + + std::size_t group_size = segment_chunk_size; + + auto deps2 = deps; + if(!q.is_in_order()) + deps2.push_back(store_segment_begin_evt); + + sycl::event group_sort_evt; + + using T = typename std::iterator_traits::value_type; + // TODO: Better to actually check local mem capacity + if(sizeof(*first1) <= 16) { + group_sort_evt = q.submit([&](sycl::handler& cgh) { + + sycl::local_accessor local_mem {group_size, cgh}; + + cgh.depends_on(deps2); + cgh.parallel_for(sycl::nd_range<1>{num_merges * group_size, group_size}, + [=](sycl::nd_item<1> idx) { + detail::segment_merge_by_group_sort( + idx.get_group(), first1, last1, first2, last2, out, + comp, segment_start_scratch1, + segment_start_scratch2, num_merges, + &(local_mem[0])); + }); + }); + } else { + group_sort_evt = q.parallel_for( + sycl::nd_range<1>{num_merges * group_size, group_size}, deps2, + [=](sycl::nd_item<1> idx) { + detail::segment_merge_by_group_sort(idx.get_group(), first1, last1, first2, + last2, out, comp, segment_start_scratch1, + segment_start_scratch2, num_merges); + }); + } + + return group_sort_evt; +} +} + + + + + +#endif diff --git a/include/hipSYCL/algorithms/merge/merge_path.hpp b/include/hipSYCL/algorithms/merge/merge_path.hpp new file mode 100644 index 000000000..0b503dbc2 --- /dev/null +++ b/include/hipSYCL/algorithms/merge/merge_path.hpp @@ -0,0 +1,184 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_MERGE_PATH_HPP +#define ACPP_ALGORITHMS_MERGE_PATH_HPP + +#include +#include +#include + +#include "../binary_search/index_search.hpp" + +namespace hipsycl::algorithms::merging { + + +/// This implements the merge path algorithm, which can be used to decompose a merge +/// into N disjoint, independent merges which can be run in parallel. For details, see +/// Green et al. (2014): Merge Path - A Visually Intuitive Approach to Parallel Merging +/// https://arxiv.org/pdf/1406.2628 +class merge_path { +public: + template + static void + nth_independent_merge_begin(ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, Compare comp, + Size partition_index, Size partition_chunk_size, + Size &array1_pos_out, Size &array2_pos_out) { + + Size input1_size = static_cast(std::distance(first1, last1)); + Size input2_size = static_cast(std::distance(first2, last2)); + + binary_diag_search(first1, last1, first2, last2, comp, input1_size, + input2_size, partition_index * partition_chunk_size, array1_pos_out, + array2_pos_out); + } + + template + static constexpr Size + num_independent_merges(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2, Size segment_chunk_size) { + Size input1_size = static_cast(std::distance(first1, last1)); + Size input2_size = static_cast(std::distance(first2, last2)); + + auto num_diags = total_num_diags(input1_size, input2_size); + + return (num_diags + segment_chunk_size - 1) / segment_chunk_size; + } + +private: + template + static auto load(ForwardIt first, Size idx) { + std::advance(first, idx); + return *first; + } + + template + static void store(ForwardIt first, Size idx, const T& val) { + std::advance(first, idx); + *first = val; + } + + + // Total number of left-bottom-top-right diagonals of the AB matrix + template + static constexpr Size total_num_diags(Size size1, Size size2) { + // There are size1 + size2 - 1 "real" diags, but we need an additional diagonal 0 before + // the actual data + return size1 + size2; + } + + template + static void + binary_diag_search(ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, + ForwardIt2 last2, Compare comp, + Size size1, Size size2, + Size diag_index, Size &array1_index_out, + Size &array2_index_out) { + + if(size1 <= 1 && size2 <= 1) { + array1_index_out = 0; + array2_index_out = 0; + return; + } + + Size dlen = diag_length(size1, size2, diag_index); + + + if(dlen <= 1) { + array1_index_out = 0; + array2_index_out = 0; + return; + } + + // The idea behind the merge path algorithm is to create the merge matrix, where the + // [i][j] entries are 1 exactly if comp(first1[i],first2[j]) == true, and 0 + // otherwise. This matrix will have a contiguous region of zeroes at the + // top, the rest will be 1. We can then find the merge path by finding the + // highest value where the cross-diagonals in the matrix are 1 using binary + // search. Since we only ever care about the merge matrix when binary + // searching on the diagonal, this function generates entries from the merge + // matrix on-the-fly with just one parameter: the current position on the + // diagonal. + auto data_loader = [&](auto idx) { + auto idx1 = array1_idx_from_diag(size1, size2, diag_index, idx); + auto idx2 = array2_idx_from_diag(size1, size2, diag_index, idx); + + // Due to arcane reasons that cannot be expressed in mere mortal words, + // the merge matrix needs to be shifted by -1 in the first dimension. 
+ // This was revealed to me in a dream.
+ auto v1 = load(first1, idx1 == 0 ? 0 : idx1 - 1);
+ auto v2 = load(first2, idx2);
+
+ bool res = comp(v1, v2);
+ return static_cast(res);
+ };
+
+ auto compare = [&](int v1, int v2) {
+ // Note: Do NOT use comp() here, since this is used to compare entries
+ // in the merge matrix (which can only be 1 or 0 as generated by data_loader),
+ // not used to compare elements of the user data array!
+ return v1 < v2;
+ };
+
+ // Run binary search across the index space [0, dlen) to find the first 1
+ // from top to bottom on the current diagonal
+ auto idx = binary_searching::index_upper_bound(Size{0}, dlen, 0,
+ data_loader, compare);
+
+ array1_index_out = array1_idx_from_diag(size1, size2, diag_index, idx);
+ array2_index_out = array2_idx_from_diag(size1, size2, diag_index, idx);
+ }
+
+ template
+ static constexpr Size diag_length(Size size1, Size size2, Size diag_idx) {
+ if(diag_idx < size1 && diag_idx < size2)
+ return diag_idx;
+
+ auto min_size = std::min(size1, size2);
+ auto max_size = std::max(size1, size2);
+
+ if(diag_idx >= min_size && diag_idx <= max_size)
+ return min_size;
+
+ return total_num_diags(size1, size2) - diag_idx;
+ }
+
+ // position on diag is incremented from the top of the matrix to the bottom.
+ template
+ static constexpr Size array1_idx_from_diag(Size size1, Size size2,
+ Size diag_idx,
+ Size position_on_diag) {
+ // Note: We need to use size and *not* size-1 in this expression.
+ // The position must be able to become invalid so that we can express when
+ // we only need elements from array2 for the merge and we have left array1.
+ auto diag_start = std::min(diag_idx, size1);
+ return diag_start - position_on_diag;
+ }
+
+ template
+ static constexpr Size array2_idx_from_diag(Size size1, Size size2,
+ Size diag_idx,
+ Size position_on_diag) {
+ if(diag_idx <= size1)
+ return position_on_diag;
+ return diag_idx - size1 + position_on_diag;
+ }
+};
+
+
+
+
+}
+
+
+
+#endif
diff --git a/include/hipSYCL/algorithms/numeric.hpp b/include/hipSYCL/algorithms/numeric.hpp
index bf3bbbf3d..5d7013c83 100644
--- a/include/hipSYCL/algorithms/numeric.hpp
+++ b/include/hipSYCL/algorithms/numeric.hpp
@@ -21,10 +21,13 @@
 #include "hipSYCL/sycl/libkernel/functional.hpp"
 #include "hipSYCL/sycl/event.hpp"
 #include "hipSYCL/sycl/queue.hpp"
+#include "hipSYCL/sycl/detail/namespace_compat.hpp"
 #include "hipSYCL/algorithms/reduction/reduction_descriptor.hpp"
 #include "hipSYCL/algorithms/reduction/reduction_engine.hpp"
+#include "hipSYCL/algorithms/scan/scan.hpp"
 #include "hipSYCL/algorithms/util/memory_streaming.hpp"
+
 namespace hipsycl::algorithms {
 
 namespace detail {
@@ -67,7 +70,8 @@
sycl::event wg_model_reduction(sycl::queue &q,
 util::allocation_group &scratch_allocations,
 T *output, T init, std::size_t target_num_groups,
 std::size_t local_size, std::size_t problem_size,
- Kernel k, BinaryReductionOp op) {
+ Kernel k, BinaryReductionOp op,
+ const std::vector& deps = {}) {
 assert(target_num_groups > 0);
 sycl::event last_event;
@@ -119,6 +123,7 @@
 last_event = q.submit([&](sycl::handler &cgh) {
 sycl::local_accessor acc{sycl::range<1>{main_kernel_local_mem}, cgh};
+ cgh.depends_on(deps);
 cgh.parallel_for(sycl::nd_range<1>{dispatched_global_size, local_size},
 main_kernel);
 });
@@ -133,60 +138,25 @@
template
sycl::event wg_model_reduction(sycl::queue &q,
 util::allocation_group &scratch_allocations,
 T *output, T init, std::size_t target_num_groups,
- std::size_t problem_size, Kernel k,
BinaryReductionOp op) { + std::size_t problem_size, Kernel k, BinaryReductionOp op, + const std::vector& deps = {}) { return wg_model_reduction(q, scratch_allocations, output, init, - target_num_groups, 128, problem_size, k, op); + target_num_groups, 128, problem_size, k, op, deps); } -template -sycl::event threading_model_reduction(sycl::queue &q, - util::allocation_group &scratch_allocations, - T *output, T init, std::size_t n, Kernel k, - BinaryReductionOp op) { - - sycl::event last_event; - auto single_task_launcher = - [&](auto kernel) { - last_event = q.single_task(kernel); - }; - - auto operator_config = get_reduction_operator_configuration(op); - auto reduction_descriptor = reduction::reduction_descriptor{ - operator_config, init, output}; - - reduction::threading_model::omp_thread_info_query thread_info_query; - reduction::threading_reduction_engine engine{thread_info_query, - &scratch_allocations}; - auto plan = engine.create_plan(n, reduction_descriptor); - auto main_kernel = engine.make_main_reducing_kernel(k, plan); - - last_event = q.submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::range<1>{n}, - main_kernel); - }); - - engine.run_additional_kernels(single_task_launcher, plan); - - return last_event; -} template sycl::event transform_reduce_impl(sycl::queue &q, util::allocation_group &scratch_allocations, T *output, T init, std::size_t n, Kernel k, - BinaryReductionOp op) { - if(q.get_device().is_host()) { -#ifdef HIPSYCL_ALGORITHMS_TRANSFORM_REDUCE_HOST_THREADING_MODEL - return threading_model_reduction(q, scratch_allocations, output, init, n, k, - op); -#endif - } + BinaryReductionOp op, + const std::vector& deps) { sycl::device dev = q.get_device(); std::size_t num_groups = dev.get_info() * 4; return wg_model_reduction(q, scratch_allocations, output, init, num_groups, - n, k, op); + n, k, op, deps); } @@ -203,7 +173,8 @@ sycl::event transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt1 first1, ForwardIt1 last1, ForwardIt2 first2, T *out, T init, BinaryReductionOp reduce, - BinaryTransformOp transform) { + BinaryTransformOp transform, + const std::vector& deps = {}) { if(first1 == last1) return sycl::event{}; @@ -217,7 +188,7 @@ transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, }; return detail::transform_reduce_impl(q, scratch_allocations, out, init, n, - kernel, reduce); + kernel, reduce, deps); } template & deps = {}) { if(first == last) return sycl::event{}; @@ -237,41 +209,124 @@ transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, }; return detail::transform_reduce_impl(q, scratch_allocations, out, init, n, - kernel, reduce); + kernel, reduce, deps); } template sycl::event transform_reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt1 first1, ForwardIt1 last1, - ForwardIt2 first2, T *out, T init) { + ForwardIt2 first2, T *out, T init, + const std::vector& deps = {}) { return transform_reduce(q, scratch_allocations, first1, last1, first2, out, - init, std::plus{}, std::multiplies{}); + init, std::plus{}, std::multiplies{}, deps); } template sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt first, ForwardIt last, T *out, T init, - BinaryOp binary_op) { + BinaryOp binary_op, + const std::vector& deps = {}) { return transform_reduce(q, scratch_allocations, first, last, out, init, - binary_op, [](auto x) { return x; }); + binary_op, [](auto x) { return x; }, deps); } template sycl::event reduce(sycl::queue &q, 
util::allocation_group &scratch_allocations, - ForwardIt first, ForwardIt last, T *out, T init) { - return reduce(q, scratch_allocations, first, last, out, init, std::plus{}); + ForwardIt first, ForwardIt last, T *out, T init, + const std::vector& deps = {}) { + return reduce(q, scratch_allocations, first, last, out, init, std::plus{}, deps); } template sycl::event reduce(sycl::queue &q, util::allocation_group &scratch_allocations, ForwardIt first, ForwardIt last, - typename std::iterator_traits::value_type *out) { + typename std::iterator_traits::value_type *out, + const std::vector& deps = {}) { return reduce(q, scratch_allocations, first, last, out, - typename std::iterator_traits::value_type{}); + typename std::iterator_traits::value_type{}, deps); } +///////////////////////////// scans ///////////////////////////////////// + +template +sycl::event +inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + const std::vector &deps = {}) { + + return scanning::scan(q, scratch_allocations, first, last, d_first, op, + std::nullopt, deps); +} + +template +sycl::event +inclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + T init, const std::vector &deps = {}) { + return scanning::scan(q, scratch_allocations, first, last, d_first, op, + init, deps); +} + +template +sycl::event inclusive_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + const std::vector &deps = {}) { + return inclusive_scan(q, scratch_allocations, first, last, d_first, + std::plus<>{}, deps); +} + +template +sycl::event +exclusive_scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, T init, + BinaryOp op, const std::vector &deps = {}) { + return scanning::scan(q, scratch_allocations, first, last, d_first, op, + init, deps); } +template +sycl::event exclusive_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + T init, const std::vector &deps = {}) { + return exclusive_scan(q, scratch_allocations, first, last, d_first, init, + std::plus<>{}, deps); +} + +template +sycl::event transform_inclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, BinaryOp binary_op, UnaryOp unary_op, + const std::vector &deps = {}) { + return scanning::transform_scan(q, scratch_allocations, first, last, + d_first, unary_op, binary_op, + std::nullopt, deps); +} + +template +sycl::event transform_inclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, BinaryOp binary_op, UnaryOp unary_op, + T init, const std::vector &deps = {}) { + return scanning::transform_scan(q, scratch_allocations, first, last, + d_first, unary_op, binary_op, + init, deps); +} + +template +sycl::event transform_exclusive_scan( + sycl::queue &q, util::allocation_group &scratch_allocations, InputIt first, + InputIt last, OutputIt d_first, T init, BinaryOp binary_op, + UnaryOp unary_op, const std::vector &deps = {}) { + return scanning::transform_scan(q, scratch_allocations, first, last, + d_first, unary_op, binary_op, init, + deps); +} + +} // algorithms + + #endif diff --git a/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp 
b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp new file mode 100644 index 000000000..f81fd0399 --- /dev/null +++ b/include/hipSYCL/algorithms/scan/decoupled_lookback_scan.hpp @@ -0,0 +1,658 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_DECOUPLED_LOOKBACK_SCAN_HPP +#define ACPP_ALGORITHMS_DECOUPLED_LOOKBACK_SCAN_HPP + +#include +#include +#include +#include +#include +#include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/sycl/libkernel/atomic_ref.hpp" +#include "hipSYCL/sycl/libkernel/group_functions.hpp" +#include "hipSYCL/sycl/jit.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +namespace hipsycl::algorithms::scanning { + +namespace detail { + +enum class status : uint32_t { + invalid = 0, + aggregate_available = 1, + prefix_available = 2 +}; + +template +struct scratch_data { + scratch_data(util::allocation_group &scratch, std::size_t num_groups) { + group_aggregate = scratch.obtain(num_groups); + inclusive_prefix = scratch.obtain(num_groups); + group_status = scratch.obtain(num_groups); + } + + T* group_aggregate; + T* inclusive_prefix; + status* group_status; +}; + +template +T kogge_stone_scan(sycl::nd_item<1> idx, T my_element, BinaryOp op, + T *local_mem) { + const int lid = idx.get_local_linear_id(); + const int local_size = idx.get_local_range().size(); + local_mem[lid] = my_element; + + for (unsigned stride = 1; stride < local_size; stride <<= 1) { + sycl::group_barrier(idx.get_group()); + T current = my_element; + if (lid >= stride) { + current = op(local_mem[lid - stride], local_mem[lid]); + } + sycl::group_barrier(idx.get_group()); + + if (lid >= stride) { + local_mem[lid] = current; + } + } + + auto result = local_mem[lid]; + sycl::group_barrier(idx.get_group()); + return result; +} + +template +T sequential_scan(sycl::nd_item<1> idx, T my_element, BinaryOp op, + T *local_mem) { + int lid = idx.get_local_linear_id(); + local_mem[lid] = my_element; + sycl::group_barrier(idx.get_group()); + + if(lid == 0) { + T current = local_mem[0]; + for(int i = 1; i < idx.get_local_range().size(); ++i) { + current = op(current, local_mem[i]); + local_mem[i] = current; + } + } + sycl::group_barrier(idx.get_group()); + auto result = local_mem[lid]; + sycl::group_barrier(idx.get_group()); + return result; +} + +template +constexpr bool can_use_group_algorithms() { + // TODO + return false; +} + +template +T collective_inclusive_group_scan(sycl::nd_item<1> idx, T my_element, + BinaryOp op, T *local_mem) { + if constexpr(can_use_group_algorithms()) { + // TODO + } else { + namespace jit = sycl::AdaptiveCpp_jit; + __acpp_if_target_sscp( + // For some reason, using the compile_if_else wrapper introduces + // overheads for host JIT in this case :( + // This seems to be unique to this particular case here though. 
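+ // Rationale sketch (illustrative numbers): the Kogge-Stone scan below runs
+ // log2(group_size) stride rounds (strides 1, 2, 4, ... -- three rounds for
+ // a group of 8) and performs O(n log n) combine operations, which amortizes
+ // across parallel lanes on GPUs. On the host backend a group maps to very
+ // few hardware threads, so the O(n) sequential_scan with a fixed number of
+ // barriers tends to be cheaper.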
+ if(jit::reflect() == + jit::compiler_backend::host) { + return sequential_scan(idx, my_element, op, local_mem); + } else { + return kogge_stone_scan(idx, my_element, op, local_mem); + } + ); + return kogge_stone_scan(idx, my_element, op, local_mem); + } +} + +template +T collective_broadcast(sycl::nd_item<1> idx, T x, int local_id, T* local_mem) { + if constexpr(can_use_group_algorithms()) { + // TODO + } else { + if(idx.get_local_linear_id() == local_id) { + *local_mem = x; + } + sycl::group_barrier(idx.get_group()); + auto result = *local_mem; + sycl::group_barrier(idx.get_group()); + return result; + } +} + +template +void iterate_host_and_inclusive_group_scan( + sycl::nd_item<1> idx, BinaryOp op, T *local_mem, std::size_t global_group_id, + Generator gen, Processor result_processor, + PrefixHandler local_prefix_to_global_prefix) { + + const int lid = idx.get_local_linear_id(); + const int group_size = idx.get_local_range().size(); + + const int num_elements = group_size * WorkPerItem; + if(lid == 0) { + T current_inclusive_scan; + for(int i = 0; i < num_elements; ++i) { + T current_element = gen(idx, i % WorkPerItem, i); + if(i == 0) + current_inclusive_scan = current_element; + else + current_inclusive_scan = op(current_inclusive_scan, current_element); + // we store the result array at i+1 to avoid conflicts with the + // fallback group broadcast, which uses element 0. + local_mem[i+1] = current_inclusive_scan; + } + } + sycl::group_barrier(idx.get_group()); + T global_prefix = local_prefix_to_global_prefix( + // Index is not -1 because we store the array at offset 1. + lid, local_mem[group_size * WorkPerItem]); + sycl::group_barrier(idx.get_group()); + if(global_group_id != 0 && lid == 0) { + for(int i = 1; i <= num_elements; ++i) { + local_mem[i] = op(global_prefix, local_mem[i]); + } + } + sycl::group_barrier(idx.get_group()); + + for(int i = 0; i < WorkPerItem; ++i) { + int effective_id = lid * WorkPerItem + i; + result_processor(i, effective_id, local_mem[effective_id+1]); + } +} + +template +void iterate_and_inclusive_group_scan( + sycl::nd_item<1> idx, BinaryOp op, T *local_mem, std::size_t global_group_id, + Generator gen, Processor result_processor, + PrefixHandler local_prefix_to_global_prefix) { + + const int lid = idx.get_local_linear_id(); + const int group_size = idx.get_local_range().size(); + + + T current_exclusive_prefix; + T scan_result [WorkPerItem]; + for(int invocation = 0; invocation < WorkPerItem; ++invocation) { + int current_id = invocation * group_size + lid; + T my_element = gen(idx, invocation, current_id); + T local_scan_result = + collective_inclusive_group_scan(idx, my_element, op, local_mem); + + if(invocation != 0) + local_scan_result = op(current_exclusive_prefix, local_scan_result); + + current_exclusive_prefix = collective_broadcast( + idx, local_scan_result, group_size - 1, local_mem); + + scan_result[invocation] = local_scan_result; + } + // has local prefix here, this also does lookback + T global_prefix = local_prefix_to_global_prefix(lid, current_exclusive_prefix); + + if(global_group_id != 0) { + for(int i = 0; i < WorkPerItem; ++i) { + scan_result[i] = op(global_prefix, scan_result[i]); + } + } + for(int i = 0; i < WorkPerItem; ++i) { + result_processor(i, i*group_size+lid, scan_result[i]); + } +} + +template +T exclusive_prefix_look_back(const T &dummy_init, int effective_group_id, + detail::status *status, T *group_aggregate, + T *inclusive_prefix, BinaryOp op) { + // dummy_init is a dummy value here; avoid relying on default 
constructor
+ // in case T has none.
+ T exclusive_prefix = dummy_init;
+ bool exclusive_prefix_initialized = false;
+
+ auto update_exclusive_prefix = [&](auto x){
+ if(!exclusive_prefix_initialized) {
+ exclusive_prefix = x;
+ exclusive_prefix_initialized = true;
+ } else {
+ exclusive_prefix = op(x, exclusive_prefix);
+ }
+ };
+
+ for(int lookback_group = effective_group_id - 1; lookback_group >= 0; --lookback_group) {
+ uint32_t& status_ptr = reinterpret_cast(status[lookback_group]);
+ sycl::atomic_ref status_ref{status_ptr};
+
+ detail::status lookback_status;
+ while ((lookback_status = static_cast(status_ref.load())) ==
+ detail::status::invalid)
+ ;
+
+ if(lookback_status == detail::status::prefix_available) {
+ update_exclusive_prefix(inclusive_prefix[lookback_group]);
+ return exclusive_prefix;
+ } else {
+ update_exclusive_prefix(group_aggregate[lookback_group]);
+ }
+ }
+
+ return exclusive_prefix;
+}
+
+template
+T load_data_element(Generator &&gen, sycl::nd_item<1> idx, BinaryOp op,
+ uint32_t effective_group_id, std::size_t global_id,
+ std::size_t problem_size, OptionalInitT init) {
+ if constexpr (IsInclusive) {
+ auto elem = gen(idx, effective_group_id, global_id, problem_size);
+ if constexpr(!std::is_same_v) {
+ if(global_id == 0) {
+ return op(init, elem);
+ }
+ }
+ return elem;
+ } else {
+ if(global_id == 0)
+ return init;
+ return gen(idx, effective_group_id, global_id - 1, problem_size);
+ }
+}
+
+template
+void flat_group_scan_kernel(sycl::nd_item<1> idx, T *local_memory,
+ scratch_data scratch, uint32_t *group_counter,
+ BinaryOp op, OptionalInitT init,
+ std::size_t problem_size, Generator &&gen,
+ Processor &&processor) {
+ sycl::atomic_ref
+ group_id_counter{*group_counter};
+
+ const int local_id = idx.get_local_linear_id();
+ uint32_t effective_group_id = idx.get_group_linear_id();
+ if(local_id == 0) {
+ effective_group_id = group_id_counter.fetch_add(static_cast(1));
+ }
+ effective_group_id = collective_broadcast(
+ idx, effective_group_id, 0, reinterpret_cast(local_memory));
+
+ const std::size_t global_id = effective_group_id * idx.get_local_range().size() +
+ local_id;
+
+ int local_size = idx.get_local_range().size();
+
+ std::size_t num_groups = idx.get_group_range().size();
+ const bool is_last_group = effective_group_id == num_groups - 1;
+ if(is_last_group) {
+ std::size_t group_offset = effective_group_id * local_size;
+ local_size = problem_size - group_offset;
+ }
+
+ // This invokes gen for the current work item to obtain our data element
+ // for the scan. If we are dealing with an exclusive scan, load_data_element
+ // shifts the data access by 1, thus allowing us to treat the scan as inclusive
+ // in the subsequent algorithm.
+ // It also applies init to the first data element, if provided.
+ T my_element = load_data_element(
+ gen, idx, op, effective_group_id, global_id, problem_size, init);
+
+ // The exclusive scan case is handled in load_data_element() by accessing the
+ // element at global_id-1 instead of global_id.
+ T local_scan_result =
+ collective_inclusive_group_scan(idx, my_element, op, local_memory);
+
+ uint32_t *status_ptr =
+ reinterpret_cast(&scratch.group_status[effective_group_id]);
+ sycl::atomic_ref status_ref{*status_ptr};
+
+ // Set group aggregate which we now know after scan. The first group
+ // can also set its prefix and is done.
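+ // Worked example of the lookback protocol (illustrative): with groups 0..2,
+ // group 0 publishes prefix_available right away. If group 2 starts its
+ // lookback while group 1 has only published aggregate_available, group 2
+ // combines group 1's aggregate and keeps walking back to group 0, whose
+ // inclusive prefix then terminates the search.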
+ if(local_id == local_size - 1) { + T group_aggregate = local_scan_result; + + if(effective_group_id == 0) { + scratch.group_aggregate[effective_group_id] = group_aggregate; + scratch.inclusive_prefix[effective_group_id] = group_aggregate; + status_ref.store(static_cast(status::prefix_available)); + } else { + scratch.group_aggregate[effective_group_id] = group_aggregate; + status_ref.store(static_cast(status::aggregate_available)); + } + } + + sycl::group_barrier(idx.get_group()); + + // All groups except group 0 need to perform lookback to find their prefix + if(effective_group_id != 0) { + // my_element is a dummy value here; avoid relying on default constructor + // in case T has none + T exclusive_prefix = my_element; + if(local_id == 0) { + exclusive_prefix = exclusive_prefix_look_back(my_element, effective_group_id, + scratch.group_status, scratch.group_aggregate, + scratch.inclusive_prefix, op); + } + exclusive_prefix = collective_broadcast( + idx, exclusive_prefix, 0, local_memory); + local_scan_result = op(exclusive_prefix, local_scan_result); + + // All groups except first and last one need to update their prefix + if(effective_group_id != num_groups - 1) { + if(local_id == local_size - 1){ + scratch.inclusive_prefix[effective_group_id] = local_scan_result; + status_ref.store(static_cast(status::prefix_available)); + } + } + } + + processor(idx, effective_group_id, global_id, problem_size, local_scan_result); +} + +template +void scan_kernel(sycl::nd_item<1> idx, T *local_memory, scratch_data scratch, + uint32_t *group_counter, BinaryOp op, OptionalInitT init, + std::size_t problem_size, Generator &&data_generator, + Processor &&processor) { + sycl::atomic_ref + group_id_counter{*group_counter}; + + const int local_id = idx.get_local_linear_id(); + const int local_size = idx.get_local_range().size(); + uint32_t effective_group_id = idx.get_group_linear_id(); + if(local_id == 0) { + effective_group_id = group_id_counter.fetch_add(static_cast(1)); + } + effective_group_id = collective_broadcast( + idx, effective_group_id, 0, reinterpret_cast(local_memory)); + + + auto generator = [=](sycl::nd_item<1> idx, int invocation, int current_local_id) { + // This invokes gen for the current work item to obtain our data element + // for the scan. If we are dealing with an exclusive scan, load_data_element + // shifts the data access by 1, thus allowing us to treat the scan as inclusive + // in the subsequent algorithm. + // It also applies init to the first data element, if provided. + std::size_t global_id = + effective_group_id * local_size * WorkPerItem + current_local_id; + + return load_data_element( + data_generator, idx, op, effective_group_id, global_id, problem_size, init); + }; + + auto local_prefix_to_global_prefix = [=](int local_id, + const T &local_inclusive_prefix) { + uint32_t *status_ptr = + reinterpret_cast(&scratch.group_status[effective_group_id]); + sycl::atomic_ref + status_ref{*status_ptr}; + + // Set group aggregate which we now know after scan. The first group + // Can also set its prefix and is done. 
+ if (local_id == 0) {
+ if (effective_group_id == 0) {
+ scratch.group_aggregate[effective_group_id] = local_inclusive_prefix;
+ scratch.inclusive_prefix[effective_group_id] = local_inclusive_prefix;
+ status_ref.store(static_cast(status::prefix_available));
+ } else {
+ scratch.group_aggregate[effective_group_id] = local_inclusive_prefix;
+ status_ref.store(static_cast(status::aggregate_available));
+ }
+ }
+
+ sycl::group_barrier(idx.get_group());
+
+ // All groups except group 0 need to perform lookback to find their prefix
+ T exclusive_prefix;
+ if(effective_group_id != 0) {
+ if(local_id == 0) {
+ exclusive_prefix = exclusive_prefix_look_back(
+ exclusive_prefix, effective_group_id, scratch.group_status,
+ scratch.group_aggregate, scratch.inclusive_prefix, op);
+ }
+ exclusive_prefix = collective_broadcast(
+ idx, exclusive_prefix, 0, local_memory);
+
+ // All groups except first need to update their prefix
+ if(local_id == local_size - 1){
+ scratch.inclusive_prefix[effective_group_id] =
+ op(exclusive_prefix, local_inclusive_prefix);
+ status_ref.store(static_cast(status::prefix_available));
+ }
+ }
+ return exclusive_prefix;
+ };
+
+ auto result_processor = [=](int invocation_id, int current_local_id,
+ T scan_result) {
+ std::size_t global_id =
+ effective_group_id * local_size * WorkPerItem + current_local_id;
+ processor(idx, effective_group_id, global_id, problem_size, scan_result);
+ };
+
+ __acpp_if_target_sscp(
+ namespace jit = sycl::AdaptiveCpp_jit;
+ if (jit::reflect() ==
+ jit::compiler_backend::host) {
+ iterate_host_and_inclusive_group_scan(
+ idx, op, local_memory, effective_group_id, generator,
+ result_processor, local_prefix_to_global_prefix);
+ return;
+ });
+ __acpp_if_target_host(
+ iterate_host_and_inclusive_group_scan(
+ idx, op, local_memory, effective_group_id, generator,
+ result_processor, local_prefix_to_global_prefix);
+ return;
+ );
+ // Only executed for non-host
+ iterate_and_inclusive_group_scan(
+ idx, op, local_memory, effective_group_id, generator, result_processor,
+ local_prefix_to_global_prefix);
+
+}
+
+template
+constexpr int work_per_item() {
+ if constexpr(!std::is_constructible_v)
+ return 1;
+ else {
+ return 16;
+ }
+}
+
+template
+void select_and_run_scan_kernel(sycl::nd_item<1> idx,
+ T *local_memory, scratch_data scratch,
+ uint32_t *group_counter, BinaryOp op,
+ OptionalInitT init, std::size_t problem_size,
+ Generator &&data_generator,
+ Processor &&processor) {
+ if constexpr (!std::is_constructible_v) {
+ flat_group_scan_kernel(idx, local_memory, scratch,
+ group_counter, op, init, problem_size,
+ data_generator, processor);
+ } else {
+ scan_kernel(), IsInclusive>(
+ idx, local_memory, scratch, group_counter, op, init, problem_size,
+ data_generator, processor);
+ }
+}
+
+} // detail
+
+
+/// Implements the decoupled lookback scan algorithm -
+/// See Merrill, Garland (2016) for details.
+///
+/// This algorithm assumes that the hardware can support acquire/release
+/// atomics.
+/// It also assumes that work groups with smaller ids are either scheduled
+/// before work groups with higher ids, or that work group execution may be
+/// preempted. To provide this guarantee universally, our implementation
+/// reassigns work group ids based on when they start executing.
+///
+/// \param gen A callable with signature \c T(nd_item<1>, uint32_t
+/// effective_group_id, size_t effective_global_id, size_t problem_size)
+///
+/// \c gen is the generator that generates the data elements to run the scan.
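+///
+/// A minimal generator sketch (assuming some device-visible pointer `data`,
+/// which is not part of this interface):
+///
+///   auto gen = [=](sycl::nd_item<1>, uint32_t /*group*/,
+///                  std::size_t gid, std::size_t n) {
+///     return data[gid < n ? gid : n - 1]; // clamp out-of-bounds ids
+///   };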
+/// Note that the scan implementation may reorder work-groups; \c gen should
+/// therefore not rely on the group id and global id from the provided nd_item,
+/// but instead use the provided \c effective_group_id and \c
+/// effective_global_id.
+///
+/// If the problem size is not divisible by the selected work group size, then
+/// the last group might invoke \c gen with ids outside the bounds. It is the
+/// responsibility of \c gen to handle this case. For these work items, the
+/// return value from \c gen can be an arbitrary dummy value (e.g. the last
+/// valid element within bounds).
+///
+/// \param processor A callable with signature \c void(nd_item<1>, uint32_t
+/// effective_group_id, size_t effective_global_id, size_t problem_size, T
+/// result)
+///
+/// \c processor is invoked at the end of the scan with the result of the global
+/// scan for this particular work item. \c processor will be invoked once the
+/// global result for the work item is available, which might be before the scan
+/// has completed for all work items. Do not assume global synchronization.
+///
+/// Note that the scan implementation may reorder work-groups; \c processor
+/// should therefore not rely on the group id and global id from the provided
+/// nd_item, but instead use the provided \c effective_group_id and \c
+/// effective_global_id.
+///
+/// If the problem size is not divisible by the selected work group size, then
+/// the last group might invoke \c processor with ids outside the bounds. It is
+/// the responsibility of \c processor to handle this case. For these work
+/// items, the result value passed into \c processor is undefined.
+template
+sycl::event
+decoupled_lookback_scan(sycl::queue &q, util::allocation_group &scratch_alloc,
+ WorkItemDataGenerator gen,
+ ResultProcessor processor, BinaryOp op,
+ std::size_t problem_size, std::size_t group_size,
+ OptionalInitT init = std::nullopt,
+ const std::vector &user_deps = {}) {
+
+ if(problem_size == 0)
+ return sycl::event{};
+
+ static_assert(IsInclusive || std::is_convertible_v,
+ "Non-inclusive scans need an init argument of same type as the "
+ "scan data element");
+ static_assert(
+ std::is_convertible_v ||
+ std::is_same_v,
+ "Init argument must be of std::nullopt_t type or exact type of scan "
+ "data elements");
+
+ std::size_t num_items = (problem_size + detail::work_per_item() - 1) /
+ detail::work_per_item();
+ std::size_t num_groups = (num_items + group_size - 1) / group_size;
+
+ detail::scratch_data scratch{scratch_alloc, num_groups};
+ uint32_t* group_counter = scratch_alloc.obtain(1);
+
+ auto initialization_evt = q.parallel_for(num_groups, [=](sycl::id<1> idx){
+ scratch.group_status[idx] = detail::status::invalid;
+ if(idx.get(0) == 0) {
+ *group_counter = 0;
+ }
+ });
+
+ std::vector deps = user_deps;
+ if(!q.is_in_order())
+ deps.push_back(initialization_evt);
+
+ bool is_host = q.get_device().get_backend() == sycl::backend::omp;
+
+ sycl::nd_range<1> kernel_range{num_groups * group_size, group_size};
+ if constexpr(detail::can_use_group_algorithms()) {
+ if(!is_host) {
+ return q.parallel_for(kernel_range, deps, [=](auto idx) {
+ detail::select_and_run_scan_kernel(
+ idx, static_cast(nullptr), scratch,
+ group_counter, op, init, problem_size, gen, processor);
+ });
+ }
+ }
+
+ // We need local memory:
+ // - 1 data element per work item
+ // - at least size for one uint32_t to broadcast group id
+ std::size_t local_mem_elements =
+ std::max(group_size, (sizeof(uint32_t) + sizeof(T) - 1) / sizeof(T));
+
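+ // Example of this sizing (illustrative): for group_size = 128 and T = int,
+ // max(128, (4 + 4 - 1) / 4) = max(128, 1) = 128 elements. The second
+ // operand only matters for very small groups of small T; it guarantees
+ // enough elements to alias one uint32_t for the group id broadcast.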
if(is_host) { + // host also needs one element per every processed element + local_mem_elements *= detail::work_per_item(); + // ... in addition to broadcast! + ++local_mem_elements; + } + + // This is not entirely correct since max local mem size can also depend + // on work group size. + // We also assume that there is no other local memory consumer. + // TODO Improve this + std::size_t max_local_size = + q.get_device().get_info(); + + + bool has_sufficient_local_memory = + is_host || static_cast(max_local_size) >= + 1.5 * sizeof(T) * local_mem_elements; + + if(has_sufficient_local_memory) { + return q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + + sycl::local_accessor local_mem{local_mem_elements, cgh}; + cgh.parallel_for(kernel_range, [=](auto idx) { + detail::select_and_run_scan_kernel( + idx, &(local_mem[0]), scratch, group_counter, + op, init, problem_size, gen, processor); + }); + }); + } else { + // This is a super inefficient dummy algorithm for now that requires + // large scratch storage + T* emulated_local_mem = scratch_alloc.obtain(num_groups * local_mem_elements); + + return q.parallel_for(kernel_range, deps, [=](auto idx) { + detail::select_and_run_scan_kernel( + idx, + emulated_local_mem + local_mem_elements * idx.get_group_linear_id(), + scratch, group_counter, op, init, problem_size, gen, processor); + }); + } +} +} + +#endif diff --git a/include/hipSYCL/algorithms/scan/scan.hpp b/include/hipSYCL/algorithms/scan/scan.hpp new file mode 100644 index 000000000..4f4581be6 --- /dev/null +++ b/include/hipSYCL/algorithms/scan/scan.hpp @@ -0,0 +1,135 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_SCAN_HPP +#define ACPP_ALGORITHMS_SCAN_HPP + +#include "hipSYCL/sycl/event.hpp" +#include "hipSYCL/sycl/queue.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" + +#include "decoupled_lookback_scan.hpp" +#include + +namespace hipsycl::algorithms::scanning { + +namespace detail { + +inline std::size_t select_scan_work_group_size(sycl::queue& q) { + std::size_t group_size = 128; + if(q.get_device().AdaptiveCpp_device_id().get_backend() == sycl::backend::omp) { + group_size = 1024; + } + return group_size; +} + +} + + +template +sycl::event generate_scan_process(sycl::queue &q, util::allocation_group &scratch_allocations, + std::size_t problem_size, BinaryOp op, + OptionalInitT init, Generator gen, Processor processor, + const std::vector &deps = {}) { + + std::size_t group_size = detail::select_scan_work_group_size(q); + + return scanning::decoupled_lookback_scan( + q, scratch_allocations, gen, processor, op, problem_size, + group_size, init, deps); +} + +template +sycl::event scan(sycl::queue &q, util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op, + OptionalInitT init, + const std::vector &deps = {}) { + + auto generator = [=](auto idx, auto effective_group_id, auto effective_global_id, + auto problem_size) { + if(effective_global_id >= problem_size) + effective_global_id = problem_size - 1; + + InputIt it = first; + std::advance(it, effective_global_id); + return *it; + }; + auto result_processor = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size, + auto value) { + if (effective_global_id < problem_size) { + OutputIt it = d_first; + std::advance(it, effective_global_id); + *it = value; + } + }; + + std::size_t problem_size = std::distance(first, last); + using T = std::decay_t; + + return generate_scan_process( + q, scratch_allocations, problem_size, op, init, generator, + result_processor, deps); +} + +template +sycl::event transform_scan(sycl::queue &q, + util::allocation_group &scratch_allocations, + InputIt first, InputIt last, OutputIt d_first, + UnaryOp unary_op, BinaryOp op, OptionalInitT init, + const std::vector &deps = {}) { + + using T = std::decay_t; + + auto generator = [=](auto idx, auto effective_group_id, auto effective_global_id, + auto problem_size) { + if(effective_global_id >= problem_size) { + if constexpr(std::is_constructible_v) { + return T{}; + } else { + // This might be invalid according to a very strict implementation of C++ + // definition of e.g. transform_reduce, since it does not guarantee that + // unary_op is executed exactly once per element. + // However, working around this might be fairly costly in case T is not + // default constructible (Idea: Global variable guarded by an atomic lock + // which is set by the first thread to have loaded a value), so for + // now we do "the simple thing". This is probably still better than + // not offloading in that case. 
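+ // Note: this filler value is never written back; result_processor
+ // below only stores results for effective_global_id < problem_size,
+ // so at worst unary_op runs on *first more than once.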
+ return unary_op(*first); + } + } + + InputIt it = first; + std::advance(it, effective_global_id); + return unary_op(*it); + }; + auto result_processor = [=](auto idx, auto effective_group_id, + auto effective_global_id, auto problem_size, + auto value) { + if (effective_global_id < problem_size) { + OutputIt it = d_first; + std::advance(it, effective_global_id); + *it = value; + } + }; + + std::size_t problem_size = std::distance(first, last); + + return generate_scan_process( + q, scratch_allocations, problem_size, op, init, generator, + result_processor, deps); +} +} + +#endif diff --git a/include/hipSYCL/algorithms/sort/bitonic_sort.hpp b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp new file mode 100644 index 000000000..f2518438d --- /dev/null +++ b/include/hipSYCL/algorithms/sort/bitonic_sort.hpp @@ -0,0 +1,110 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef ACPP_ALGORITHMS_BITONIC_SORT +#define ACPP_ALGORITHMS_BITONIC_SORT + +#include +#include +#include "hipSYCL/sycl/queue.hpp" + +namespace hipsycl::algorithms::sorting { + +namespace detail{ + + +template +RandomIt advance_to(RandomIt first, Size i) { + std::advance(first, i); + return first; +} + +inline bool can_compare(std::size_t left_id, std::size_t right_id, + std::size_t problem_size) { + + return (left_id < right_id) && (left_id < problem_size) && + (right_id < problem_size); +} + + +} //detail + +template +void bitonic_group_sort(RandomIt first, SizeT group_size, SizeT problem_size, + SizeT item, Barrier barrier, Compare comp) { + + auto process_pass = [=](SizeT j) { + for(SizeT a_id = item; a_id < problem_size; a_id += group_size) { + SizeT b_id = a_id ^ j; + if(detail::can_compare(a_id, b_id, problem_size)) { + auto a = *detail::advance_to(first, a_id); + auto b = *detail::advance_to(first, b_id); + if(comp(b, a)) { + *detail::advance_to(first, a_id) = b; + *detail::advance_to(first, b_id) = a; + } + } + } + barrier(); + }; + + for(SizeT k = 2; (k >> 1) < problem_size; k *= 2) { + process_pass(k-1); + + for (SizeT j = k >> 1; j > 0; j >>= 1) { + process_pass(j); + } + } +} + +template +sycl::event bitonic_sort(sycl::queue &q, RandomIt first, RandomIt last, + Comparator comp, const std::vector& deps = {}) { + + std::size_t problem_size = std::distance(first, last); + sycl::event most_recent_event; + bool is_first_kernel = true; + + auto launch_kernel = [&](std::size_t j){ + + auto k = [=](sycl::id<1> idx) { + std::size_t a_id = idx.get(0); + std::size_t b_id = a_id ^ j; + if(detail::can_compare(a_id, b_id, problem_size)) { + auto a = *detail::advance_to(first, a_id); + auto b = *detail::advance_to(first, b_id); + if(comp(b, a)) { + *detail::advance_to(first, a_id) = b; + *detail::advance_to(first, b_id) = a; + } + } + }; + if(is_first_kernel || q.is_in_order()) + most_recent_event = q.parallel_for(problem_size, deps, k); + else + most_recent_event = q.parallel_for(problem_size, most_recent_event, k); + is_first_kernel = false; + }; + + for (std::size_t k = 2; (k >> 1) < problem_size; k *= 2) { + launch_kernel(k-1); + + for (std::size_t j = k >> 1; j > 0; j >>= 1) { + launch_kernel(j); + } + } + + return most_recent_event; +} // bitonic_sort + +} + +#endif diff --git 
a/include/hipSYCL/algorithms/util/allocation_cache.hpp b/include/hipSYCL/algorithms/util/allocation_cache.hpp index 257db32bd..b92ed32c7 100644 --- a/include/hipSYCL/algorithms/util/allocation_cache.hpp +++ b/include/hipSYCL/algorithms/util/allocation_cache.hpp @@ -53,10 +53,10 @@ class allocation_cache { std::lock_guard lock{_mutex}; for(auto& allocation : _allocations) { - _rt.get()->backends() + auto* allocator = _rt.get()->backends() .get(allocation.dev.get_backend()) - ->get_allocator(allocation.dev) - ->free(allocation.ptr); + ->get_allocator(allocation.dev); + rt::deallocate(allocator, allocation.ptr); } _allocations.clear(); } @@ -74,12 +74,12 @@ class allocation_cache { ->get_allocator(dev); if(_alloc_type == allocation_type::device) - result.ptr = allocator->allocate(min_alignment, min_size); + result.ptr = rt::allocate_device(allocator, min_alignment, min_size); else if(_alloc_type == allocation_type::shared) - result.ptr = allocator->allocate_usm(min_size); + result.ptr = rt::allocate_shared(allocator, min_size); else result.ptr = - allocator->allocate_optimized_host(min_alignment, min_size); + rt::allocate_host(allocator, min_alignment, min_size); } return result; } diff --git a/include/hipSYCL/algorithms/util/memory_streaming.hpp b/include/hipSYCL/algorithms/util/memory_streaming.hpp index 4d0b87512..f87914a03 100644 --- a/include/hipSYCL/algorithms/util/memory_streaming.hpp +++ b/include/hipSYCL/algorithms/util/memory_streaming.hpp @@ -15,6 +15,7 @@ #include "hipSYCL/sycl/device.hpp" #include "hipSYCL/sycl/libkernel/nd_item.hpp" #include "hipSYCL/sycl/info/device.hpp" +#include "hipSYCL/sycl/jit.hpp" #include @@ -62,13 +63,14 @@ class data_streamer { static void run(std::size_t problem_size, sycl::nd_item<1> idx, F &&f) noexcept { __acpp_if_target_sscp( - if(sycl::jit::introspect() == sycl::jit::backend::host) { - run_host(problem_size, idx, f); - } else { - run_device(problem_size, idx, f); - } - return; - ); + namespace jit = sycl::AdaptiveCpp_jit; + jit::compile_if_else( + jit::reflect() == + jit::compiler_backend::host, + [&]() { run_host(problem_size, idx, f); }, + [&]() { run_device(problem_size, idx, f); }); + + return;); __acpp_if_target_device( run_device(problem_size, idx, f); ); diff --git a/include/hipSYCL/common/allocation_map.hpp b/include/hipSYCL/common/allocation_map.hpp new file mode 100644 index 000000000..90360428e --- /dev/null +++ b/include/hipSYCL/common/allocation_map.hpp @@ -0,0 +1,496 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_ALLOCATION_MAP_HPP +#define ACPP_ALLOCATION_MAP_HPP + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace hipsycl::common { + +struct stdlib_untyped_allocator { + static void* allocate(size_t n) { + return std::malloc(n); + } + + static void deallocate(void* ptr) { + std::free(ptr); + } +}; + +template +class bit_tree { +protected: + bit_tree(){} + + static constexpr int num_levels = sizeof...(Bit_sizes); + static constexpr int root_level_idx = num_levels - 1; + static constexpr int bitsizes[num_levels] = {Bit_sizes...}; + + static constexpr int get_num_entries_in_level(int level) { + return 1ull << bitsizes[level]; + } + + static constexpr int get_bitoffset_in_level(int level) { + int result = 0; + for(int i = 0; i < level; ++i) { + result += bitsizes[i]; + } + return result; + } + + static constexpr int get_index_in_level(Int_type address, int level) { + Int_type bitmask = get_n_low_bits_set(bitsizes[level]); + return (address >> get_bitoffset_in_level(level)) & bitmask; + } + + static constexpr uint64_t get_n_low_bits_set(int n) { + if(n == 64) + return ~0ull; + return (1ull << n) - 1; + } + + static constexpr uint64_t get_space_spanned_by_node_in_level(int level) { + uint64_t result = 1; + for(int i = 0; i < level; ++i) + result *= get_num_entries_in_level(level); + return result; + } + + template + static T* alloc(int count) { + return static_cast(UntypedAllocatorT::allocate(sizeof(T) * count)); + } + + static void free(void* ptr) { + UntypedAllocatorT::deallocate(ptr); + } +}; + +template +using allocation_map_bit_tree_config = bit_tree; + +template +class allocation_map : public allocation_map_bit_tree_config { +public: + using bit_tree_t = allocation_map_bit_tree_config; + + static_assert(sizeof(void*) == 8, "Unsupported pointer size"); + static_assert(std::is_trivial_v, "UserPayload must be trivial type"); + + allocation_map() + : _num_in_progress_operations{0} {} + + struct value_type : public UserPayload { + std::size_t allocation_size; + }; + + // Access entry of allocation that address belongs to, or nullptr if the address + // does not belong to a known allocation. + value_type* get_entry(uint64_t address, uint64_t& root_address) noexcept { + insert_or_get_entry_lock lock{_num_in_progress_operations}; + root_address = 0; + int num_leaf_attempts = 0; + return get_entry(_root, address, num_leaf_attempts, root_address); + } + + // Access entry of allocation that has the given address. Unlike get_entry(), + // this does not succeed if the address does not point to the base of the allocation. + value_type* get_entry_of_root_address(uint64_t address) noexcept { + insert_or_get_entry_lock lock{_num_in_progress_operations}; + return get_entry_of_root_address(_root, address); + } + + // Insert new element. Element's allocation range must be + // non-overlapping w.r.t existing entries. + // ~0ull is unsupported, because then non-zero allocation + // ranges cannot be expressed. 
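+ // Illustrative use (hypothetical payload type and addresses):
+ //
+ //   allocation_map<my_payload> map;
+ //   map.insert(base, value_type{/*payload fields*/ {}, /*allocation_size*/ 4096});
+ //   uint64_t root = 0;
+ //   auto* entry = map.get_entry(base + 128, root);
+ //   // entry != nullptr, root == base, entry->allocation_size == 4096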
+ bool insert(uint64_t address, const value_type& v) { + insert_or_get_entry_lock lock{_num_in_progress_operations}; + return insert(_root, address, v); + } + + bool erase(uint64_t address) { + erase_lock lock{_num_in_progress_operations}; + return erase(_root, address); + } + + ~allocation_map() { + for (int i = 0; + i < this->get_num_entries_in_level(bit_tree_t::root_level_idx); ++i) { + auto* ptr = _root.children[i].load(std::memory_order_acquire); + if(ptr) + release(*ptr); + } + } + +private: + // Useful for debugging/printing + template + void with_decomposed_address(uint64_t address, int current_level, F&& handler) { + for(int i = this->root_level_idx; i >= current_level; --i) { + handler(this->get_index_in_level(address, i)); + } + for(int i = current_level - 1; i >= 0; --i) { + handler(-1); + } + } + + template + void print(Ostream& ostr, uint64_t address, int level) { + with_decomposed_address(address, level, [&](int x){ + if(x >= 0) + ostr << x << "."; + else + ostr << "x"; + }); + ostr << "\n"; + } + + struct leaf_node { + leaf_node() + : num_entries {} { + for(int i = 0; i < bit_tree_t::get_num_entries_in_level(0); ++i) { + entries[i].allocation_size = 0; + } + } + + value_type entries [bit_tree_t::get_num_entries_in_level(0)]; + std::atomic num_entries; + }; + + template + struct intermediate_node { + private: + static constexpr auto make_child() { + if constexpr (Level > 1) return + intermediate_node{}; + else return leaf_node{}; + } + public: + intermediate_node() + : children{}, num_entries{} {} + + using child_type = decltype(make_child()); + + std::atomic children [bit_tree_t::get_num_entries_in_level(Level)]; + std::atomic num_entries; + }; + + value_type *get_entry(leaf_node ¤t_node, uint64_t address, + int &/*num_leaf_attempts*/, + uint64_t &root_address) noexcept { + int start_address = 0; + + uint64_t max_local_address = + root_address | (bit_tree_t::get_num_entries_in_level(0) - 1); + + if(max_local_address <= address) + start_address = bit_tree_t::get_num_entries_in_level(0) - 1; + else + start_address = bit_tree_t::get_index_in_level(address, 0); + + for (int local_address = start_address; local_address >= 0; + --local_address) { + + auto& element = current_node.entries[local_address]; + + std::size_t allocation_size = + __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); + if(allocation_size > 0) { + + uint64_t root_address_candidate = + root_address | (static_cast(local_address) + << bit_tree_t::get_bitoffset_in_level(0)); + + uint64_t allocation_end = root_address_candidate + allocation_size; + if(address >= root_address_candidate && address < allocation_end) { + root_address = root_address_candidate; + return &element; + } else { + return nullptr; + } + + } + } + return nullptr; + } + + template + value_type *get_entry(intermediate_node ¤t_node, + uint64_t address, + int& num_leaf_attempts, + uint64_t& root_address) noexcept { + // If the queried address is too close to the next allocation, + // it can happen that the search converges on the next allocation. + // Therefore, to exclude that case, if a search fails, we also + // need to try again with the next allocation before that. + // This variable counts how many leaves we have accessed. If it + // reaches two, we can abort. 
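+ // Illustration: the downward walk may first reach a leaf whose nearest
+ // entry starts below the queried address but is too small to cover it.
+ // Only the immediately preceding allocation can still contain the
+ // address, so one more leaf is inspected before giving up.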
+ if constexpr(Level == bit_tree_t::root_level_idx) { + num_leaf_attempts = 0; + } + + uint64_t max_local_address = + root_address | + this->get_n_low_bits_set(bit_tree_t::get_bitoffset_in_level(Level) + + bit_tree_t::bitsizes[Level]); + + // We are always looking for the next allocation preceding the + // current address. If the maximum local address in this node + // cannot reach the search address, (e.g. if we are looking in + // a preceding node at the same level), we need to start from + // the maximum address. Otherwise, we need to look at the bits + // set in this address. + int start_address = 0; + if(max_local_address <= address) + start_address = bit_tree_t::get_num_entries_in_level(Level) - 1; + else + start_address = bit_tree_t::get_index_in_level(address, Level); + + for (int local_address = start_address; + local_address >= 0; --local_address) { + + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + + if(ptr) { + uint64_t root_address_candidate = + root_address | (static_cast(local_address) + << bit_tree_t::get_bitoffset_in_level(Level)); + + auto* ret = get_entry(*ptr, address, num_leaf_attempts, + root_address_candidate); + // If we are in level 1, ret refers to a leaf node + if constexpr(Level == 1) { + ++num_leaf_attempts; + } + + if(ret) { + root_address = root_address_candidate; + return ret; + } else if(num_leaf_attempts >= 2) { + // We can abort if we have looked at the first hit leaf node, + // and the one before that. + return nullptr; + } + } + } + return nullptr; + } + + value_type *get_entry_of_root_address(leaf_node ¤t_node, uint64_t address) noexcept { + int local_address = bit_tree_t::get_index_in_level(address, 0); + + auto& element = current_node.entries[local_address]; + std::size_t allocation_size = + __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); + + if (allocation_size > 0) { + return &element; + } + + return nullptr; + } + + template + value_type *get_entry_of_root_address(intermediate_node ¤t_node, + uint64_t address) noexcept { + int local_address = bit_tree_t::get_index_in_level(address, Level); + + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + + if(ptr) { + return get_entry_of_root_address(*ptr, address); + } + return nullptr; + } + + bool insert(leaf_node ¤t_node, uint64_t address, const value_type &v) { + + int local_address = bit_tree_t::get_index_in_level(address, 0); + + std::size_t *allocation_size_ptr = + &(current_node.entries[local_address].allocation_size); + + std::size_t allocation_size = __atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE); + if(allocation_size > 0) { + // Entry is already occupied + return false; + } + + __atomic_store_n(allocation_size_ptr, v.allocation_size, __ATOMIC_RELEASE); + current_node.entries[local_address].UserPayload::operator=(v); + + current_node.num_entries.fetch_add( + 1, std::memory_order_acq_rel); + + return true; + } + + template + bool insert(intermediate_node ¤t_node, uint64_t address, + const value_type &v) { + using child_t = typename intermediate_node::child_type; + + int local_address = bit_tree_t::get_index_in_level(address, Level); + + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + + if(!ptr) { + child_t* new_child = this->template alloc(1); + new (new_child) child_t{}; + + if (!current_node.children[local_address].compare_exchange_strong( + ptr /* == nullptr*/, new_child, std::memory_order_acq_rel)) { + // Assigning new child has failed because child is no 
longer nullptr + // -> free new child again + destroy(*new_child); + this->free(new_child); + } else { + current_node.num_entries.fetch_add( + 1, std::memory_order_acq_rel); + ptr = new_child; + } + } + + return insert(*ptr, address, v); + } + + bool erase(leaf_node& current_node, uint64_t address) { + int local_address = bit_tree_t::get_index_in_level(address, 0); + + std::size_t *allocation_size_ptr = + &(current_node.entries[local_address].allocation_size); + // Entry was already deleted or does not exist + if(__atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE) == 0) + return false; + + __atomic_store_n(allocation_size_ptr, 0, __ATOMIC_RELEASE); + + current_node.num_entries.fetch_sub( + 1, std::memory_order_acq_rel); + + return true; + } + + template + bool erase(intermediate_node ¤t_node, uint64_t address) { + + int local_address = bit_tree_t::get_index_in_level(address, Level); + auto *ptr = current_node.children[local_address].load( + std::memory_order_acquire); + if(!ptr) + return false; + + bool result = erase(*ptr, address); + if(result) { + if(ptr->num_entries.load(std::memory_order_acquire) == 0) { + auto *current_ptr = current_node.children[local_address].exchange( + nullptr, std::memory_order_acq_rel); + // TODO: We could potentially get erase() lock-free + // by counting by how many ops each node is currently used, + // and waiting here until the count turns to 0. + if(current_ptr) { + destroy(*current_ptr); + this->free(current_ptr); + current_node.num_entries.fetch_sub( + 1, std::memory_order_acq_rel); + } + } + } + return result; + } + + void release(leaf_node& current_node) { + destroy(current_node); + } + + template + void release(intermediate_node& current_node) { + for(int i = 0; i < bit_tree_t::get_num_entries_in_level(Level); ++i){ + if (auto *ptr = current_node.children[i].load( + std::memory_order_acquire)) { + release(*ptr); + this->free(ptr); + } + } + destroy(current_node); + } + + void destroy(leaf_node& node) { + node.~leaf_node(); + } + + template + void destroy(intermediate_node& node) { + node.~intermediate_node(); + } + + struct erase_lock { + public: + erase_lock(std::atomic& op_counter) + : _op_counter{op_counter} { + int expected = 0; + while (!_op_counter.compare_exchange_strong( + expected, -1, std::memory_order_release, std::memory_order_relaxed)) { + expected = 0; + } + } + + ~erase_lock() { + _op_counter.store(0, std::memory_order_release); + } + private: + std::atomic& _op_counter; + }; + + struct insert_or_get_entry_lock { + public: + insert_or_get_entry_lock(std::atomic& op_counter) + : _op_counter{op_counter} { + int expected = std::max(0, _op_counter.load(std::memory_order_acquire)); + while (!_op_counter.compare_exchange_strong( + expected, expected+1, std::memory_order_release, + std::memory_order_relaxed)) { + if(expected < 0) + expected = 0; + } + } + + ~insert_or_get_entry_lock() { + _op_counter.fetch_sub(1, std::memory_order_acq_rel); + } + private: + std::atomic& _op_counter; + }; + + intermediate_node _root; + std::atomic _num_in_progress_operations; +}; + + +} + +#endif diff --git a/include/hipSYCL/common/debug.hpp b/include/hipSYCL/common/debug.hpp index 3f2c92aa6..e393e2840 100644 --- a/include/hipSYCL/common/debug.hpp +++ b/include/hipSYCL/common/debug.hpp @@ -45,7 +45,7 @@ class output_stream { private: output_stream() - : _debug_level {HIPSYCL_DEBUG_LEVEL}, _output_stream{std::cout} { + : _debug_level {HIPSYCL_DEBUG_LEVEL}, _output_stream{std::cerr} { #if !defined(HIPSYCL_COMPILER_COMPONENT) && !defined(HIPSYCL_TOOL_COMPONENT) 
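
For orientation, since the class body above is dense: lookups walk the bit tree from the root level down, and the two lock helpers implement a reader-writer scheme on a single atomic counter (get_entry/insert share the structure, erase takes it exclusively by parking the counter at -1). The per-level index arithmetic can be illustrated standalone; the following minimal sketch uses hypothetical bit widths {16, 16, 16, 16}, not the widths AdaptiveCpp actually instantiates:

    #include <cstdint>
    #include <cstdio>

    // Mirrors the bit_tree index math from allocation_map.hpp; the per-level
    // bit widths here are illustrative only (level 0 is the leaf level).
    constexpr int bitsizes[] = {16, 16, 16, 16};

    constexpr int get_bitoffset_in_level(int level) {
      int result = 0;
      for (int i = 0; i < level; ++i)
        result += bitsizes[i];
      return result;
    }

    constexpr int get_index_in_level(uint64_t address, int level) {
      uint64_t bitmask = (1ull << bitsizes[level]) - 1;
      return static_cast<int>((address >> get_bitoffset_in_level(level)) & bitmask);
    }

    int main() {
      uint64_t address = 0x00007f1234567890ull;
      // Walk from the root level (3) down to the leaf level (0), as get_entry() does.
      for (int level = 3; level >= 0; --level)
        std::printf("level %d index: 0x%x\n", level, get_index_in_level(address, level));
    }
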
diff --git a/include/hipSYCL/common/debug.hpp b/include/hipSYCL/common/debug.hpp
index 3f2c92aa6..e393e2840 100644
--- a/include/hipSYCL/common/debug.hpp
+++ b/include/hipSYCL/common/debug.hpp
@@ -45,7 +45,7 @@ class output_stream {
 
 private:
   output_stream()
-      : _debug_level {HIPSYCL_DEBUG_LEVEL}, _output_stream{std::cout} {
+      : _debug_level {HIPSYCL_DEBUG_LEVEL}, _output_stream{std::cerr} {
 #if !defined(HIPSYCL_COMPILER_COMPONENT) && !defined(HIPSYCL_TOOL_COMPONENT)
     _debug_level = rt::application::get_settings().get<rt::setting::debug_level>();
diff --git a/include/hipSYCL/compiler/cbs/IRUtils.hpp b/include/hipSYCL/compiler/cbs/IRUtils.hpp
index ebbd8520d..2b25eb502 100644
--- a/include/hipSYCL/compiler/cbs/IRUtils.hpp
+++ b/include/hipSYCL/compiler/cbs/IRUtils.hpp
@@ -15,6 +15,7 @@
 
 #include
 #include
+#include <llvm/ADT/StringRef.h>
 
 namespace llvm {
 class Region;
@@ -58,6 +59,7 @@ static const std::array<const char *, 3> NumGroupsGlobalNames{
     NumGroupsGlobalNameX, NumGroupsGlobalNameY, NumGroupsGlobalNameZ};
 static constexpr const char SscpDynamicLocalMemoryPtrName[] = "__acpp_cbs_sscp_dynamic_local_memory";
+static constexpr const char SscpInternalLocalMemoryPtrName[] = "__acpp_cbs_sscp_internal_local_memory";
 } // namespace cbs
 
 static constexpr const char SscpAnnotationsName[] = "hipsycl.sscp.annotations";
@@ -78,6 +80,9 @@ template <...> struct PtrSetWrapper {
   auto begin() -> decltype(Set.begin()) { return Set.begin(); }
 };
 
+
+void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm::Value *To,
+                         llvm::StringRef LogPrefix = "");
+
 llvm::Loop *updateDtAndLi(llvm::LoopInfo &LI, llvm::DominatorTree &DT,
                           const llvm::BasicBlock *B, llvm::Function &F);
diff --git a/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp b/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp
index fdee51bfa..037f4fd42 100644
--- a/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp
+++ b/include/hipSYCL/compiler/cbs/VectorShapeTransformer.hpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 namespace hipsycl::compiler {
 using SmallValVec = llvm::SmallVector<...>;
diff --git a/include/hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp b/include/hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp
new file mode 100644
index 000000000..2e03629ea
--- /dev/null
+++ b/include/hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp
@@ -0,0 +1,34 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef HIPSYCL_SSCP_KNOWN_PTR_PARAM_ALIGNMENT_OPT_PASS_HPP
+#define HIPSYCL_SSCP_KNOWN_PTR_PARAM_ALIGNMENT_OPT_PASS_HPP
+
+#include
+#include
+
+namespace hipsycl {
+namespace compiler {
+
+class KnownPtrParamAlignmentOptPass : public llvm::PassInfoMixin<KnownPtrParamAlignmentOptPass> {
+public:
+  KnownPtrParamAlignmentOptPass(
+      const std::unordered_map<std::string, std::vector<std::pair<int, int>>> &KnownAlignments);
+  llvm::PreservedAnalyses run(llvm::Module &M,
+                              llvm::ModuleAnalysisManager &MAM);
+private:
+  std::unordered_map<std::string, std::vector<std::pair<int, int>>> KnownPtrParamAlignments;
+};
+
+}
+}
+
+#endif
+
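
How the pass consumes this map is not shown in the header. Conceptually, a known alignment for a pointer parameter can be encoded as an `align` parameter attribute; the following is a sketch under that assumption, not the actual pass body:

    #include <llvm/IR/Attributes.h>
    #include <llvm/IR/Function.h>
    #include <llvm/Support/Alignment.h>

    // Hypothetical helper: attach one (ParamIndex, Alignment) entry to a function.
    void applyKnownAlignment(llvm::Function &F, int ParamIndex, int Alignment) {
      if (static_cast<unsigned>(ParamIndex) >= F.arg_size())
        return;
      // Only pointer parameters can carry an align attribute.
      if (!F.getArg(ParamIndex)->getType()->isPointerTy())
        return;
      F.addParamAttr(ParamIndex, llvm::Attribute::getWithAlignment(
                                     F.getContext(), llvm::Align(Alignment)));
    }
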
diff --git a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp
index e530cd13e..e987aea1e 100644
--- a/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp
+++ b/include/hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp
@@ -23,7 +23,7 @@
 #include
 #include
 #include "AddressSpaceMap.hpp"
-#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp"
 #include "hipSYCL/runtime/util.hpp"
 
 namespace llvm {
@@ -50,29 +50,15 @@ class LLVMToBackendTranslator {
 
   virtual ~LLVMToBackendTranslator() {}
 
-  // Do not use inside llvm-to-backend infrastructure targets to avoid
-  // requiring RTTI-enabled LLVM
-  template <...>
-  void setS2IRConstant(const T &value) {
-    static_assert(std::is_integral_v<T> || std::is_floating_point_v<T>,
-                  "Unsupported type for S2 IR constant");
-
-    std::string name = typeid(__acpp_sscp_s2_ir_constant<...>).name();
-    setS2IRConstant(name, value);
-  }
-
-  template <class T>
-  void setS2IRConstant(const std::string &name, T value) {
-    setS2IRConstant(name, static_cast<const void *>(&value));
-  }
-
-  void setS2IRConstant(const std::string &name, const void *ValueBuffer);
+  void setNoAliasKernelParam(const std::string &KernelName, int ParamIndex);
 
   void specializeKernelArgument(const std::string &KernelName, int ParamIndex,
                                 const void *ValueBuffer);
   void specializeFunctionCalls(const std::string &FuncName,
                                const std::vector<std::string> &ReplacementCalls,
                                bool OverrideOnlyUndefined=true);
 
+  void setKnownPtrParamAlignment(const std::string &FunctionName, int ParamIndex,
+                                 int Alignment);
+
   bool setBuildFlag(const std::string &Flag);
   bool setBuildOption(const std::string &Option, const std::string &Value);
   bool setBuildToolArguments(const std::string &ToolName, const std::vector<std::string> &Args);
@@ -82,6 +68,8 @@ class LLVMToBackendTranslator {
     return setBuildOption(Option, std::to_string(Value));
   }
 
+  void setReflectionField(const std::string &name, uint64_t value);
+
   // Does partial transformation to backend-flavored LLVM IR
   bool partialTransformation(const std::string &LLVMIR, std::string &out);
 
@@ -90,7 +78,6 @@ class LLVMToBackendTranslator {
   bool prepareIR(llvm::Module &M);
   bool translatePreparedIR(llvm::Module &FlavoredModule, std::string &out);
 
-
   const std::vector<std::string> &getErrorLog() const {
     return Errors;
   }
@@ -229,12 +216,18 @@ class LLVMToBackendTranslator {
   void runKernelDeadArgumentElimination(llvm::Module &M, llvm::Function *F, PassHandler &PH,
                                         std::vector<int> &RetainedIndicesOut);
 
+  std::string getCompilationIdentifier() const;
+
   int S2IRConstantBackendId;
 
   std::vector<std::string> OutliningEntrypoints;
+  // function call specializations might result in additional outlining entrypoints
+  // that we need to consider early on
+  std::vector<std::string> FunctionCallSpecializationOutliningEntrypoints;
   std::vector<std::string> Kernels;
 
   std::vector<std::string> Errors;
+
   std::unordered_map<std::string, std::function<void(llvm::Module &)>> SpecializationApplicators;
   ExternalSymbolResolver SymbolResolver;
   bool HasExternalSymbolResolver = false;
@@ -243,6 +236,11 @@ class LLVMToBackendTranslator {
   std::string ErroringCode;
 
   std::vector<std::pair<std::string, std::vector<int>*>> FunctionsForDeadArgumentElimination;
+  std::unordered_map<std::string, std::vector<int>> NoAliasParameters;
+
+  // map from kernel name to list of (param index, alignment)
+  std::unordered_map<std::string, std::vector<std::pair<int, int>>> KnownPtrParamAlignments;
+  std::unordered_map<std::string, uint64_t> ReflectionFields;
 };
diff --git a/include/hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp b/include/hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp
new file mode 100644
index 000000000..bb97531f0
--- /dev/null
+++ b/include/hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp
@@ -0,0 +1,39 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef ACPP_S2_REFLECTION_HPP
+#define ACPP_S2_REFLECTION_HPP
+
+#include
+#include
+#include
+#include
+
+namespace hipsycl {
+namespace compiler {
+
+// Processes calls to
+// - __acpp_jit_reflect_ or __acpp_s2_reflect_ functions (different synonyms),
+//   replacing callsites with provided constants.
+// - __acpp_jit_reflect_knows_ or __acpp_s2_reflect_knows_<>
+class ProcessS2ReflectionPass : public llvm::PassInfoMixin<ProcessS2ReflectionPass> {
+public:
+  ProcessS2ReflectionPass(const std::unordered_map<std::string, uint64_t> &Fields);
+  llvm::PreservedAnalyses run(llvm::Module &M,
+                              llvm::ModuleAnalysisManager &MAM);
+private:
+  std::unordered_map<std::string, uint64_t> SupportedFields;
+};
+
+}
+}
+
+#endif
+
diff --git a/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp b/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp
index 9e5edab37..883358be9 100644
--- a/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp
+++ b/include/hipSYCL/compiler/llvm-to-backend/Utils.hpp
@@ -70,7 +70,7 @@ inline llvm::Error loadModuleFromString(const std::string &LLVMIR, llvm::LLVMCon
 
 template <class F>
-inline void constructPassBuilder(F &&handler) {
+inline auto withPassBuilder(F &&handler) {
   llvm::LoopAnalysisManager LAM;
   llvm::FunctionAnalysisManager FAM;
   llvm::CGSCCAnalysisManager CGAM;
@@ -82,11 +82,11 @@ inline void constructPassBuilder(F &&handler) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  handler(PB, LAM, FAM, CGAM, MAM);
+  return handler(PB, LAM, FAM, CGAM, MAM);
 }
 
 template <class F>
-inline void constructPassBuilderAndMAM(F &&handler) {
+inline auto withPassBuilderAndMAM(F &&handler) {
   llvm::LoopAnalysisManager LAM;
   llvm::FunctionAnalysisManager FAM;
   llvm::CGSCCAnalysisManager CGAM;
@@ -98,7 +98,7 @@ inline void constructPassBuilderAndMAM(F &&handler) {
   PB.registerLoopAnalyses(LAM);
   PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
 
-  handler(PB, MAM);
+  return handler(PB, MAM);
 }
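
The constructPassBuilder-to-withPassBuilder rename above is not cosmetic: the handler's result is now returned to the caller. A hypothetical call site (the module M and the reflection field map Fields are assumed to exist in the surrounding code):

    bool ok = withPassBuilderAndMAM([&](llvm::PassBuilder &PB,
                                        llvm::ModuleAnalysisManager &MAM) {
      llvm::ModulePassManager MPM;
      MPM.addPass(ProcessS2ReflectionPass{Fields});
      MPM.run(M, MAM);
      return true; // propagated to the caller thanks to the new return type
    });
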
diff --git a/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp b/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp
index 9979ea10a..de544ee9b 100644
--- a/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp
+++ b/include/hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp
@@ -18,9 +18,13 @@ namespace compiler {
 
 class HostKernelWrapperPass : public llvm::PassInfoMixin<HostKernelWrapperPass> {
   std::int64_t DynamicLocalMemSize;
+  std::array<int, 3> KnownWgSize;
+
 public:
-  explicit HostKernelWrapperPass(std::int64_t DynamicLocalMemSize)
-      : DynamicLocalMemSize{DynamicLocalMemSize} {}
+  explicit HostKernelWrapperPass(std::int64_t DynamicLocalMemSize, int KnownGroupSizeX,
+                                 int KnownGroupSizeY, int KnownGroupSizeZ)
+      : DynamicLocalMemSize{DynamicLocalMemSize},
+        KnownWgSize{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ} {}
 
   llvm::PreservedAnalyses run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
 
   static bool isRequired() { return true; }
diff --git a/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp b/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp
index f1ecca091..9f44b4e06 100644
--- a/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp
+++ b/include/hipSYCL/compiler/sscp/KernelOutliningPass.hpp
@@ -20,6 +20,8 @@ namespace compiler {
 
 class EntrypointPreparationPass : public llvm::PassInfoMixin<EntrypointPreparationPass> {
 public:
+  EntrypointPreparationPass(bool ExportByDefault = false);
+
   llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
 
   const std::vector<std::string> &getKernelNames() const {
@@ -38,6 +40,7 @@ class EntrypointPreparationPass : public llvm::PassInfoMixin<EntrypointPreparationPass> {
   std::vector<std::string> KernelNames;
   std::vector<std::string> OutliningEntrypoints;
   std::vector<std::string> NonKernelOutliningEntrypoints;
+  bool ExportAll;
 };
 
 // Removes all code not belonging to kernels
diff --git a/include/hipSYCL/compiler/utils/LLVMUtils.hpp b/include/hipSYCL/compiler/utils/LLVMUtils.hpp
index 9cc28ab4e..4e1fc02cb 100644
--- a/include/hipSYCL/compiler/utils/LLVMUtils.hpp
+++ b/include/hipSYCL/compiler/utils/LLVMUtils.hpp
@@ -11,7 +11,7 @@
 #ifndef HIPSYCL_LLVMUTILS_HPP
 #define HIPSYCL_LLVMUTILS_HPP
 
-
+#include <llvm/ADT/StringRef.h>
 #if LLVM_VERSION_MAJOR < 16
 #define IS_OPAQUE(pointer) (pointer->isOpaquePointerTy())
 #define HAS_TYPED_PTR 1
@@ -20,4 +20,23 @@
 #define HAS_TYPED_PTR 0
 #endif
 
+namespace hipsycl::llvmutils {
+
+  inline bool starts_with(llvm::StringRef String, llvm::StringRef Prefix) {
+#if LLVM_VERSION_MAJOR < 18
+    return String.startswith(Prefix);
+#else
+    return String.starts_with(Prefix);
+#endif
+  }
+
+  inline bool ends_with(llvm::StringRef String, llvm::StringRef Suffix) {
+#if LLVM_VERSION_MAJOR < 18
+    return String.endswith(Suffix);
+#else
+    return String.ends_with(Suffix);
+#endif
+  }
+} // namespace hipsycl::llvmutils
+
 #endif // HIPSYCL_LLVMUTILS_HPP
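
These wrappers exist because LLVM 18 renamed StringRef::startswith/endswith to starts_with/ends_with. A typical call site would look like this (sketch; F is assumed to come from a pass iterating over a module, and the handler is hypothetical):

    // Works on LLVM < 18 (startswith) and >= 18 (starts_with) alike.
    if (hipsycl::llvmutils::starts_with(F.getName(), "__acpp_sscp_jit_reflect_"))
      processReflectionCall(F); // hypothetical handler
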
diff --git a/include/hipSYCL/glue/error.hpp b/include/hipSYCL/glue/error.hpp
index a49ad4699..377d79c95 100644
--- a/include/hipSYCL/glue/error.hpp
+++ b/include/hipSYCL/glue/error.hpp
@@ -28,11 +28,11 @@ namespace glue {
 inline void print_async_errors(sycl::exception_list error_list) {
   if (error_list.size() > 0) {
     std::ostream &output_stream = common::output_stream::get().get_stream();
-    output_stream << "============== hipSYCL error report ============== "
+    output_stream << "============== AdaptiveCpp error report ============== "
                   << std::endl;
 
     output_stream
-        << "hipSYCL has caught the following unhandled asynchronous errors: "
+        << "AdaptiveCpp has caught the following unhandled asynchronous errors: "
         << std::endl
         << std::endl;
     int idx = 0;
diff --git a/include/hipSYCL/glue/kernel_launcher_data.hpp b/include/hipSYCL/glue/kernel_launcher_data.hpp
index 145e0a8ad..cb747c0bb 100644
--- a/include/hipSYCL/glue/kernel_launcher_data.hpp
+++ b/include/hipSYCL/glue/kernel_launcher_data.hpp
@@ -31,6 +31,7 @@ class dag_node;
 class kernel_configuration;
 class backend_kernel_launch_capabilities;
 class hcf_kernel_info;
+class kernel_operation;
 }
 
 namespace glue {
@@ -53,7 +54,7 @@ struct kernel_launcher_data {
   rt::range<3> group_size; // <- indices must be flipped
   unsigned local_mem_size;
   // In case the launch is a custom operation
-  std::function<void(sycl::interop_handle &)> custom_op;
+  std::function<void(rt::kernel_operation *, sycl::interop_handle &)> custom_op;
 
   using invoker_function_t = rt::result (*)(
       const kernel_launcher_data &launch_config, rt::dag_node *node,
diff --git a/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp b/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp
index 2ff117c05..e72e1ca3e 100644
--- a/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp
+++ b/include/hipSYCL/glue/llvm-sscp/ir_constants.hpp
@@ -11,37 +11,6 @@
 #ifndef HIPSYCL_IR_CONSTANTS_HPP
 #define HIPSYCL_IR_CONSTANTS_HPP
 
-#include
-
 #include "s1_ir_constants.hpp"
-#include "s2_ir_constants.hpp"
-
-template <...>
-ValueT __acpp_sscp_s2_ir_constant<...>::get(
-    ValueT default_value) noexcept {
-  // The static variable will cause clang to emit a global variable in LLVM IR,
-  // that we will turn into a constant during S2 compilation.
-  //
-  // TODO We may have to suppress compiler warnings about uninitialized data
-  // here
-  //
-  // S2 Compiler will look for special identifier __acpp_ir_constant_v to
-  // distinguish the actual IR constant from other global variables related to
-  // this class (e.g. type information).
-  static ValueT __acpp_ir_constant_v;
-  if (__acpp_sscp_is_device) {
-    return __acpp_ir_constant_v;
-  } else {
-    return default_value;
-  }
-}
-
-namespace hipsycl::sycl::jit {
-
-template <...>
-auto introspect(ValueT default_value = {}) noexcept {
-  return __acpp_sscp_s2_ir_constant<...>::get(default_value);
-}
-}
 
 #endif
diff --git a/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp b/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp
new file mode 100644
index 000000000..f366e725c
--- /dev/null
+++ b/include/hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp
@@ -0,0 +1,118 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef ACPP_GLUE_JIT_REFLECTION_QUERIES_HPP
+#define ACPP_GLUE_JIT_REFLECTION_QUERIES_HPP
+
+
+namespace hipsycl {
+namespace sycl {
+namespace AdaptiveCpp_jit {
+
+enum class compiler_backend : int {
+  spirv = 0,
+  ptx = 1,
+  amdgpu = 2,
+  host = 3
+};
+
+namespace vendor_id {
+
+inline constexpr int nvidia = 4318;
+inline constexpr int amd = 1022;
+inline constexpr int intel = 8086;
+
+}
+
+}
+}
+}
+
+
+
+extern "C" bool __acpp_sscp_jit_reflect_knows_target_vendor_id();
+extern "C" bool __acpp_sscp_jit_reflect_knows_target_arch();
+extern "C" bool __acpp_sscp_jit_reflect_knows_target_has_independent_forward_progress();
+extern "C" bool __acpp_sscp_jit_reflect_knows_runtime_backend();
+extern "C" bool __acpp_sscp_jit_reflect_knows_compiler_backend();
+extern "C" bool __acpp_sscp_jit_reflect_knows_target_is_cpu();
+extern "C" bool __acpp_sscp_jit_reflect_knows_target_is_gpu();
+
+extern "C" int __acpp_sscp_jit_reflect_target_vendor_id();
+extern "C" int __acpp_sscp_jit_reflect_target_arch();
+extern "C" bool __acpp_sscp_jit_reflect_target_is_cpu();
+extern "C" bool __acpp_sscp_jit_reflect_target_is_gpu();
+extern "C" bool __acpp_sscp_jit_reflect_target_has_independent_forward_progress();
+extern "C" int __acpp_sscp_jit_reflect_runtime_backend();
+extern "C" hipsycl::sycl::AdaptiveCpp_jit::compiler_backend __acpp_sscp_jit_reflect_compiler_backend();
+
+namespace hipsycl {
+namespace sycl {
+namespace AdaptiveCpp_jit {
+
+
+namespace reflection_query {
+
+#define ACPP_DEFINE_REFLECT_QUERY(name)                                        \
+  struct name {                                                                \
+    __attribute__((always_inline)) static bool is_known() {                    \
+      return __acpp_sscp_jit_reflect_knows_##name();                           \
+    }                                                                          \
+    __attribute__((always_inline)) static auto get() {                         \
+      return __acpp_sscp_jit_reflect_##name();                                 \
+    }                                                                          \
+  };
+
+ACPP_DEFINE_REFLECT_QUERY(target_vendor_id)
+ACPP_DEFINE_REFLECT_QUERY(target_arch)
+ACPP_DEFINE_REFLECT_QUERY(target_has_independent_forward_progress)
+ACPP_DEFINE_REFLECT_QUERY(target_is_cpu)
+ACPP_DEFINE_REFLECT_QUERY(target_is_gpu)
+ACPP_DEFINE_REFLECT_QUERY(runtime_backend)
+ACPP_DEFINE_REFLECT_QUERY(compiler_backend)
+
+#undef ACPP_DEFINE_REFLECT_QUERY
+
+}
+
+template <class Query>
+__attribute__((always_inline))
+auto reflect() {
+  return Query::get();
+}
+
+template <class Query>
+__attribute__((always_inline))
+bool knows() {
+  return Query::is_known();
+}
+
+template <class F>
+__attribute__((always_inline))
+void compile_if(bool condition, F &&f) {
+  if(condition) {
+    f();
+  }
+}
+
+template <class F, class G>
+__attribute__((always_inline))
+auto compile_if_else(bool condition, F &&if_branch, G &&else_branch) {
+  if(condition) {
+    return if_branch();
+  } else {
+    return else_branch();
+  }
+}
+
+}
+}
+}
+#endif
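
This header replaces the removed __acpp_sscp_s2_ir_constant/sycl::jit::introspect mechanism (see the ir_constants.hpp hunk above). From device code, usage could look like the following sketch; since ProcessS2ReflectionPass folds the reflect/knows calls into constants at JIT time, the untaken branch can be dropped entirely:

    using namespace hipsycl::sycl::AdaptiveCpp_jit;

    // Hypothetical device function selecting a backend-specific code path.
    inline int select_path() {
      return compile_if_else(
          knows<reflection_query::compiler_backend>() &&
              reflect<reflection_query::compiler_backend>() == compiler_backend::ptx,
          []() { return 1; },  // e.g. PTX-specific path
          []() { return 0; }); // generic path
    }
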
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef ACPP_GLUE_JIT_REFLECTION_MAP_HPP
+#define ACPP_GLUE_JIT_REFLECTION_MAP_HPP
+
+#include <unordered_map>
+#include <string>
+#include <cstdint>
+
+#include "hipSYCL/runtime/hardware.hpp"
+
+namespace hipsycl {
+namespace glue {
+namespace jit {
+
+using reflection_map = std::unordered_map<std::string, uint64_t>;
+
+inline reflection_map construct_default_reflection_map(rt::hardware_context *ctx) {
+  reflection_map rmap;
+  rmap["target_vendor_id"] = ctx->get_property(rt::device_uint_property::vendor_id);
+  rmap["target_has_independent_forward_progress"] = static_cast<uint64_t>(ctx->has(
+      rt::device_support_aspect::work_item_independent_forward_progress));
+  rmap["target_arch"] = ctx->get_property(rt::device_uint_property::architecture);
+  rmap["target_is_gpu"] = ctx->is_gpu() ? 1 : 0;
+  rmap["target_is_cpu"] = ctx->is_cpu() ? 1 : 0;
+
+  rmap["runtime_backend"] = ctx->get_property(rt::device_uint_property::backend_id);
+  // compiler_backend is set by the LLVMToBackend infrastructure.
+  return rmap;
+}
+
+}
+}
+}
+
+#endif
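
Putting reflection_map.hpp and the extended compile() entry points together, the intended wiring on the queue side presumably looks like this (sketch; all parameters are assumed to be set up by the surrounding backend code):

    #include <string>
    #include "hipSYCL/glue/llvm-sscp/jit.hpp"
    #include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp"

    // Sketch: how an SSCP queue might wire the reflection map into a JIT compile.
    hipsycl::rt::result compile_with_reflection(
        hipsycl::compiler::LLVMToBackendTranslator *translator,
        hipsycl::rt::hcf_object_id hcf_object, const std::string &image_name,
        const hipsycl::rt::kernel_configuration &config,
        hipsycl::rt::hardware_context *ctx, std::string &out) {
      auto refl_map = hipsycl::glue::jit::construct_default_reflection_map(ctx);
      return hipsycl::glue::jit::compile(translator, hcf_object, image_name,
                                         config, refl_map, out);
    }
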
diff --git a/include/hipSYCL/glue/llvm-sscp/jit.hpp b/include/hipSYCL/glue/llvm-sscp/jit.hpp
index d8dc5ea8d..6d4552fa2 100644
--- a/include/hipSYCL/glue/llvm-sscp/jit.hpp
+++ b/include/hipSYCL/glue/llvm-sscp/jit.hpp
@@ -21,6 +21,7 @@
 #include "hipSYCL/runtime/kernel_cache.hpp"
 #include "hipSYCL/runtime/kernel_configuration.hpp"
 #include "hipSYCL/runtime/application.hpp"
+#include "jit-reflection/reflection_map.hpp"
 #include
 #include
 #include
@@ -230,28 +231,45 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator,
                           const std::string &source,
                           const rt::kernel_configuration &config,
                           const symbol_list_t &imported_symbol_names,
+                          const reflection_map &refl_map,
                           std::string &output) {
   assert(translator);
   runtime_linker configure_linker {translator, imported_symbol_names};
 
   // Apply configuration
-  translator->setS2IRConstant<sycl::jit::current_backend, int>(
-      translator->getBackendId());
-  for(const auto &entry : config.s2_ir_entries()) {
-    translator->setS2IRConstant(entry.get_name(), entry.get_data_buffer());
-  }
 
   if(translator->getKernels().size() == 1) {
     // Currently we only can specialize kernel arguments for the
     // single-kernel code object model
+    HIPSYCL_DEBUG_INFO << "jit: Configuring kernel "
+                       << translator->getKernels()[0] << std::endl;
     for(const auto &entry : config.specialized_arguments()) {
+      HIPSYCL_DEBUG_INFO << "jit: Specializing argument " << entry.first
+                         << " = " << entry.second << std::endl;
       translator->specializeKernelArgument(translator->getKernels().front(),
                                            entry.first, &entry.second);
     }
+
+    int num_param_indices = static_cast<int>(config.get_num_kernel_param_indices());
+    for (int i = 0; i < num_param_indices; ++i) {
+      if (config.has_kernel_param_flag(i, rt::kernel_param_flag::noalias)) {
+        HIPSYCL_DEBUG_INFO << "jit: Setting argument " << i << " to noalias"
+                           << std::endl;
+        translator->setNoAliasKernelParam(translator->getKernels().front(), i);
+      }
+    }
+    for(const auto &entry : config.known_alignments()) {
+      HIPSYCL_DEBUG_INFO << "jit: Setting argument " << entry.first
+                         << " to alignment " << entry.second << std::endl;
+      translator->setKnownPtrParamAlignment(translator->getKernels().front(),
+                                            entry.first, entry.second);
+    }
   }
 
   for(const auto &entry : config.function_call_specialization_config()) {
     auto &config = entry.value->function_call_map;
     for(const auto &call_specialization : config) {
+      HIPSYCL_DEBUG_INFO << "jit: Specializing function call to "
+                         << call_specialization.first << std::endl;
       translator->specializeFunctionCalls(call_specialization.first,
                                           call_specialization.second, false);
     }
   }
@@ -271,6 +289,11 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator,
     translator->setBuildFlag(rt::to_string(flag));
   }
 
+  // Set up JIT-time reflection for the code we compile
+  for(const auto &KV : refl_map) {
+    translator->setReflectionField(KV.first, KV.second);
+  }
+
   // Transform code
   if(!translator->fullTransformation(source, output)) {
     // In case of failure, if a dump directory for IR is set,
@@ -307,6 +330,7 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator,
                           const common::hcf_container *hcf,
                           const std::string &image_name,
                           const rt::kernel_configuration &config,
+                          const reflection_map &refl_map,
                           std::string &output) {
   assert(hcf);
   assert(hcf->root_node());
@@ -346,13 +370,15 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator,
   symbol_list_t imported_symbol_names =
       target_image_node->get_as_list("imported-symbols");
 
-  return compile(translator, source, config, imported_symbol_names, output);
+  return compile(translator, source, config, imported_symbol_names, refl_map,
+                 output);
 }
 
 inline rt::result compile(compiler::LLVMToBackendTranslator *translator,
                           rt::hcf_object_id hcf_object,
                           const std::string &image_name,
                           const rt::kernel_configuration &config,
+                          const reflection_map &refl_map,
                           std::string &output) {
   const common::hcf_container *hcf = rt::hcf_cache::get().get_hcf(hcf_object);
   if(!hcf) {
@@ -362,17 +388,20 @@ inline rt::result compile(compiler::LLVMToBackendTranslator *translator,
   }
 
   return compile(translator, hcf, image_name, config,
-                 output);
+                 refl_map, output);
 }
 
 namespace dead_argument_elimination {
 
 // Compiles with dead-argument-elimination for the kernels, and saves
 // the retained argument mask in the appdb. This only works for single-kernel
 // compilations!
-inline rt::result compile_kernel(
-    compiler::LLVMToBackendTranslator *translator, rt::hcf_object_id hcf_object,
-    const std::string &image_name, const rt::kernel_configuration &config,
-    rt::kernel_configuration::id_type binary_id, std::string &output) {
+inline rt::result compile_kernel(compiler::LLVMToBackendTranslator *translator,
+                                 rt::hcf_object_id hcf_object,
+                                 const std::string &image_name,
+                                 const rt::kernel_configuration &config,
+                                 rt::kernel_configuration::id_type binary_id,
+                                 const reflection_map &refl_map,
+                                 std::string &output) {
 
   assert(translator->getKernels().size() == 1);
 
@@ -386,7 +415,8 @@ inline rt::result compile_kernel(
     translator->enableDeadArgumentElminiation(translator->getKernels()[0],
                                               retained_args);
 
-    err = compile(translator, hcf_object, image_name, config, output);
+    err = compile(translator, hcf_object, image_name, config, refl_map,
+                  output);
   });
 
   return err;
diff --git a/include/hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp b/include/hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp
deleted file mode 100644
index c1dd14b95..000000000
--- a/include/hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
- * parallelism for CPUs and GPUs.
- *
- * Copyright The AdaptiveCpp Contributors
- *
- * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
- * See file LICENSE in the project root for full license details.
- */
-// SPDX-License-Identifier: BSD-2-Clause
-#ifndef HIPSYCL_S2_IR_CONSTANTS_HPP
-#define HIPSYCL_S2_IR_CONSTANTS_HPP
-
-/// \brief This file contains S2 IR constant definitions that may
-/// be shared across the hipSYCL compiler code.
-///
-/// As such, no undefined globals should be pulled into this file.
-///
-/// Unlike Stage 1 IR constants, Stage 2 IR constants can be constructed
-/// programmatically by the user.
-
-// S2 IR constants can be identified from their usage of
-// __acpp_sscp_s2_ir_constant
-template <...>
-struct __acpp_sscp_s2_ir_constant {
-  static ValueT get(ValueT default_value) noexcept;
-
-  using value_type = ValueT;
-};
-
-
-namespace hipsycl::glue::sscp {
-  struct ir_constant_name {};
-}
-
-namespace hipsycl::sycl::jit {
-
-namespace backend {
-
-inline constexpr int spirv = 0;
-inline constexpr int ptx = 1;
-inline constexpr int amdgpu = 2;
-inline constexpr int host = 3;
-
-}
-
-constexpr glue::sscp::ir_constant_name current_backend;
-
-}
-
-#endif
diff --git a/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp b/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp
index 4ec4ef796..fb202ee83 100644
--- a/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp
+++ b/include/hipSYCL/glue/llvm-sscp/sscp_kernel_launcher.hpp
@@ -295,7 +295,12 @@ class sscp_kernel_launcher
 
     } else if constexpr (type == rt::kernel_type::custom) {
       // handled at invoke time
-      data.custom_op = k;
+      data.custom_op = [k](rt::kernel_operation *kernel_op,
+                           sycl::interop_handle &ih) mutable {
+        kernel_op->initialize_embedded_pointers(
+            static_cast<void *>(&k),
+            sizeof(Kernel));
+        k(ih);
+      };
 
     } else {
       assert(false && "Unsupported kernel type");
@@ -313,8 +318,9 @@ class sscp_kernel_launcher
       assert(backend_params);
       sycl::interop_handle handle{node->get_assigned_device(),
                                   backend_params};
-
-      launch_config.custom_op(handle);
+      auto *kernel_op =
+          static_cast<rt::kernel_operation *>(node->get_operation());
+      launch_config.custom_op(kernel_op, handle);
 
       return rt::make_success();
     } else {
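
The practical effect of this launcher change: pointers captured inside a custom operation ("embedded pointers") are now patched by the runtime before the stored lambda runs, via initialize_embedded_pointers. On the user side this concerns code like the following sketch, which assumes the AdaptiveCpp_enqueue_custom_operation handler extension and a CUDA-backend queue:

    #include <sycl/sycl.hpp>

    // Sketch (assumes the AdaptiveCpp_enqueue_custom_operation extension).
    void submit_custom_op(sycl::queue &q, sycl::buffer<float, 1> &buf) {
      q.submit([&](sycl::handler &cgh) {
        auto acc = buf.get_access<sycl::access_mode::read>(cgh);
        cgh.AdaptiveCpp_enqueue_custom_operation([=](sycl::interop_handle &ih) {
          // The runtime patches embedded pointers (e.g. inside the accessor) in
          // the stored lambda before invoking it; that is what
          // initialize_embedded_pointers provides in the launcher above.
          void *native_mem = ih.get_native_mem<sycl::backend::cuda>(acc);
          (void)native_mem; // ... pass to a backend library ...
        });
      });
    }
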
diff --git a/include/hipSYCL/runtime/allocation_tracker.hpp b/include/hipSYCL/runtime/allocation_tracker.hpp
new file mode 100644
index 000000000..48a830158
--- /dev/null
+++ b/include/hipSYCL/runtime/allocation_tracker.hpp
@@ -0,0 +1,33 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef ACPP_ALLOCATION_TRACKER_HPP
+#define ACPP_ALLOCATION_TRACKER_HPP
+
+#include <cstdint>
+#include "runtime_event_handlers.hpp"
+#include "hipSYCL/common/allocation_map.hpp"
+
+namespace hipsycl {
+namespace rt {
+
+class allocation_tracker {
+public:
+  static bool query_allocation(const void *ptr, allocation_info &out,
+                               uint64_t &root_address);
+  static bool register_allocation(const void *ptr, std::size_t size,
+                                  const allocation_info &info);
+  static bool unregister_allocation(const void *ptr);
+};
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/hipSYCL/runtime/allocator.hpp b/include/hipSYCL/runtime/allocator.hpp
index 240d07d2a..79375e0b7 100644
--- a/include/hipSYCL/runtime/allocator.hpp
+++ b/include/hipSYCL/runtime/allocator.hpp
@@ -33,14 +33,24 @@ struct pointer_info {
 class backend_allocator
 {
 public:
-  virtual void *allocate(size_t min_alignment, size_t size_bytes) = 0;
-  // Optimized host memory - may be page-locked, device mapped if supported
-  virtual void* allocate_optimized_host(size_t min_alignment, size_t bytes) = 0;
-  virtual void free(void *mem) = 0;
+
+  /// Raw allocation mechanism that does not interact with the runtime's
+  /// event handler mechanism. Should not be called directly in most cases.
+  virtual void *raw_allocate(size_t min_alignment, size_t size_bytes) = 0;
+  /// Optimized host memory - may be page-locked, device mapped if supported.
+  /// Raw mechanism that does not interact with the runtime's
+  /// event handler mechanism. Should not be called directly in most cases.
+  virtual void* raw_allocate_optimized_host(size_t min_alignment, size_t bytes) = 0;
+  /// Raw free mechanism that does not interact with the runtime's
+  /// event handler mechanism. Should not be called directly in most cases.
+  virtual void raw_free(void *mem) = 0;
+  /// Allocate memory accessible both from the host and the backend.
+  /// Raw mechanism that does not interact with the runtime's
+  /// event handler mechanism. Should not be called directly in most cases.
+  virtual void *raw_allocate_usm(size_t bytes) = 0;
+
+  virtual device_id get_device() const = 0;
 
-  /// Allocate memory accessible both from the host and the backend
-  virtual void *allocate_usm(size_t bytes) = 0;
   virtual bool is_usm_accessible_from(backend_descriptor b) const = 0;
 
   // Query the given pointer for its properties. If pointer is unknown,
@@ -53,6 +63,14 @@ class backend_allocator
   virtual ~backend_allocator(){}
 };
 
+void *allocate_device(backend_allocator *alloc, size_t min_alignment,
+                      size_t size_bytes);
+void *allocate_host(backend_allocator *alloc, size_t min_alignment,
+                    size_t bytes);
+void *allocate_shared(backend_allocator *alloc, size_t bytes);
+void deallocate(backend_allocator *alloc, void *mem);
+
+
 }
 }
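
The implementation of these free functions is not part of this header, but the raw_*/non-raw split strongly suggests the following shape: wrap the raw entry points and notify the runtime's event-handler layer, so allocation tracking has a single choke point. A sketch under that assumption, not the actual implementation:

    namespace hipsycl::rt {

    void *allocate_device(backend_allocator *alloc, size_t min_alignment,
                          size_t size_bytes) {
      void *ptr = alloc->raw_allocate(min_alignment, size_bytes);
      if (ptr)
        application::event_handler_layer().on_new_allocation(
            ptr, size_bytes,
            allocation_info{alloc->get_device(),
                            allocation_info::allocation_type::device});
      return ptr;
    }

    void deallocate(backend_allocator *alloc, void *mem) {
      application::event_handler_layer().on_deallocation(mem);
      alloc->raw_free(mem);
    }

    }
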
diff --git a/include/hipSYCL/runtime/application.hpp b/include/hipSYCL/runtime/application.hpp
index 4cb86bcc9..7f426ca69 100644
--- a/include/hipSYCL/runtime/application.hpp
+++ b/include/hipSYCL/runtime/application.hpp
@@ -16,6 +16,7 @@
 #include "backend.hpp"
 #include "device_id.hpp"
 #include "settings.hpp"
+#include "runtime_event_handlers.hpp"
 
 namespace hipsycl {
 namespace rt {
@@ -24,6 +25,7 @@ class dag_manager;
 class runtime;
 class async_error_list;
 
+
 class application
 {
 public:
@@ -32,6 +34,7 @@ class application
   // from the runtime or kernel launchers.
   static std::shared_ptr<runtime> get_runtime_pointer();
   static async_error_list &errors();
+  static runtime_event_handlers &event_handler_layer();
 
   application() = delete;
 };
diff --git a/include/hipSYCL/runtime/cuda/cuda_allocator.hpp b/include/hipSYCL/runtime/cuda/cuda_allocator.hpp
index 50e514237..0497f865c 100644
--- a/include/hipSYCL/runtime/cuda/cuda_allocator.hpp
+++ b/include/hipSYCL/runtime/cuda/cuda_allocator.hpp
@@ -21,20 +21,22 @@ class cuda_allocator : public backend_allocator
 public:
   cuda_allocator(backend_descriptor desc, int cuda_device);
 
-  virtual void* allocate(size_t min_alignment, size_t size_bytes) override;
+  virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override;
 
-  virtual void *allocate_optimized_host(size_t min_alignment,
-                                        size_t bytes) override;
+  virtual void *raw_allocate_optimized_host(size_t min_alignment,
+                                            size_t bytes) override;
 
-  virtual void free(void *mem) override;
+  virtual void raw_free(void *mem) override;
 
-  virtual void *allocate_usm(size_t bytes) override;
+  virtual void *raw_allocate_usm(size_t bytes) override;
 
   virtual bool is_usm_accessible_from(backend_descriptor b) const override;
 
   virtual result query_pointer(const void *ptr, pointer_info &out) const override;
 
   virtual result mem_advise(const void *addr, std::size_t num_bytes,
                             int advise) const override;
+
+  virtual device_id get_device() const override;
 private:
   backend_descriptor _backend_descriptor;
   int _dev;
diff --git a/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp b/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp
index bc58a7d3c..4086609b9 100644
--- a/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp
+++ b/include/hipSYCL/runtime/cuda/cuda_hardware_manager.hpp
@@ -52,6 +52,8 @@ class cuda_hardware_context : public hardware_context
   virtual std::string get_driver_version() const override;
   virtual std::string get_profile() const override;
 
+  virtual std::size_t get_platform_index() const override;
+
   virtual ~cuda_hardware_context();
 
   cuda_allocator* get_allocator() const;
@@ -74,6 +76,8 @@ class cuda_hardware_manager : public backend_hardware_manager
   virtual hardware_context *get_device(std::size_t index) override;
   virtual device_id get_device_id(std::size_t index) const override;
 
+  virtual std::size_t get_num_platforms() const override;
+
   virtual ~cuda_hardware_manager() {}
 
 private:
diff --git a/include/hipSYCL/runtime/cuda/cuda_queue.hpp b/include/hipSYCL/runtime/cuda/cuda_queue.hpp
index 0003b6068..4c9e07a88 100644
--- a/include/hipSYCL/runtime/cuda/cuda_queue.hpp
+++ b/include/hipSYCL/runtime/cuda/cuda_queue.hpp
@@ -22,6 +22,7 @@
 #include "hipSYCL/runtime/code_object_invoker.hpp"
 #include "hipSYCL/runtime/cuda/cuda_event.hpp"
 #include "hipSYCL/runtime/kernel_configuration.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp"
 
 // Forward declare CUstream_st instead of including cuda_runtime_api.h.
@@ -138,6 +139,7 @@ class cuda_queue : public inorder_queue
   common::spin_lock _sscp_submission_spin_lock;
   glue::jit::cxx_argument_mapper _arg_mapper;
   kernel_configuration _config;
+  glue::jit::reflection_map _reflection_map;
 };
 
 }
diff --git a/include/hipSYCL/runtime/data.hpp b/include/hipSYCL/runtime/data.hpp
index c80bca32e..6879e5016 100644
--- a/include/hipSYCL/runtime/data.hpp
+++ b/include/hipSYCL/runtime/data.hpp
@@ -417,7 +417,7 @@ class data_region
                      "a memory leak."
                   << std::endl;
     } else {
-      alloc.managing_allocator->free(alloc.memory);
+      rt::deallocate(alloc.managing_allocator, alloc.memory);
     }
   }
   return true;
diff --git a/include/hipSYCL/runtime/device_id.hpp b/include/hipSYCL/runtime/device_id.hpp
index 0a346a388..0f9704862 100644
--- a/include/hipSYCL/runtime/device_id.hpp
+++ b/include/hipSYCL/runtime/device_id.hpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include <cstdint>
 
 namespace hipsycl {
 namespace rt {
@@ -101,6 +102,12 @@ class device_id
   {
     return !(a == b);
   }
+
+  uint64_t hash_code() const {
+    uint32_t backend = static_cast<uint32_t>(_backend.id);
+    uint32_t id = _device_id;
+    return (static_cast<uint64_t>(backend) << 32) | id;
+  }
 private:
   backend_descriptor _backend;
   int _device_id;
diff --git a/include/hipSYCL/runtime/hardware.hpp b/include/hipSYCL/runtime/hardware.hpp
index 4dc7d8495..13083fe8b 100644
--- a/include/hipSYCL/runtime/hardware.hpp
+++ b/include/hipSYCL/runtime/hardware.hpp
@@ -93,7 +93,9 @@ enum class device_uint_property {
   printf_buffer_size,
   partition_max_sub_devices,
 
-  vendor_id
+  vendor_id,
+  architecture,
+  backend_id
 };
 
 enum class device_uint_list_property {
@@ -125,6 +127,8 @@ class hardware_context
   virtual std::string get_driver_version() const = 0;
   virtual std::string get_profile() const = 0;
 
+  virtual std::size_t get_platform_index() const = 0;
+
   virtual ~hardware_context(){}
 };
 
@@ -133,6 +137,8 @@ class backend_hardware_manager
 {
 public:
   virtual std::size_t get_num_devices() const = 0;
+  virtual std::size_t get_num_platforms() const = 0;
+
   virtual hardware_context *get_device(std::size_t index) = 0;
   virtual device_id get_device_id(std::size_t index) const = 0;
diff --git a/include/hipSYCL/runtime/hip/hip_allocator.hpp b/include/hipSYCL/runtime/hip/hip_allocator.hpp
index 7956e80f4..89b8c4a7d 100644
--- a/include/hipSYCL/runtime/hip/hip_allocator.hpp
+++ b/include/hipSYCL/runtime/hip/hip_allocator.hpp
@@ -21,20 +21,22 @@ class hip_allocator : public backend_allocator
 public:
   hip_allocator(backend_descriptor desc, int hip_device);
 
-  virtual void* allocate(size_t min_alignment, size_t size_bytes) override;
+  virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override;
 
-  virtual void *allocate_optimized_host(size_t min_alignment,
-                                        size_t bytes) override;
+  virtual void *raw_allocate_optimized_host(size_t min_alignment,
+                                            size_t bytes) override;
 
-  virtual void free(void *mem) override;
+  virtual void raw_free(void *mem) override;
 
-  virtual void *allocate_usm(size_t bytes) override;
+  virtual void *raw_allocate_usm(size_t bytes) override;
 
   virtual bool is_usm_accessible_from(backend_descriptor b) const override;
 
   virtual result query_pointer(const void *ptr, pointer_info &out) const override;
 
   virtual result mem_advise(const void *addr, std::size_t num_bytes,
                             int advise) const override;
+
+  virtual device_id get_device() const override;
 private:
   backend_descriptor _backend_descriptor;
   int _dev;
diff --git a/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp b/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp
index 9b1336127..e0ba8fbf8 100644
--- a/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp
+++ b/include/hipSYCL/runtime/hip/hip_hardware_manager.hpp
@@ -53,6 +53,8 @@ class hip_hardware_context : public hardware_context
   virtual std::string get_driver_version() const override;
   virtual std::string get_profile() const override;
 
+  virtual std::size_t get_platform_index() const override;
+
   virtual ~hip_hardware_context() {}
 
   hip_allocator* get_allocator() const;
@@ -62,6 +64,9 @@ class hip_hardware_context : public hardware_context
   std::unique_ptr<hip_allocator> _allocator;
   std::unique_ptr<hip_event_pool> _event_pool;
   int _dev;
+  // target amdgcn architecture in numeric, hexadecimal form, e.g.
+  // gfx906 is represented as 0x906.
+  int _numeric_architecture;
 };
 
 class hip_hardware_manager : public backend_hardware_manager
@@ -72,6 +77,7 @@ class hip_hardware_manager : public backend_hardware_manager
   virtual std::size_t get_num_devices() const override;
   virtual hardware_context *get_device(std::size_t index) override;
   virtual device_id get_device_id(std::size_t index) const override;
+  virtual std::size_t get_num_platforms() const override;
 
   virtual ~hip_hardware_manager() {}
 
diff --git a/include/hipSYCL/runtime/hip/hip_queue.hpp b/include/hipSYCL/runtime/hip/hip_queue.hpp
index a22e7563f..371e2f4f7 100644
--- a/include/hipSYCL/runtime/hip/hip_queue.hpp
+++ b/include/hipSYCL/runtime/hip/hip_queue.hpp
@@ -18,6 +18,7 @@
 
 #include "hipSYCL/common/spin_lock.hpp"
 #include "hipSYCL/glue/llvm-sscp/jit.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp"
 #include "hip_instrumentation.hpp"
 
 // Avoid including HIP headers to prevent conflicts with CUDA
@@ -128,6 +129,7 @@ class hip_queue : public inorder_queue
   common::spin_lock _sscp_submission_spin_lock;
   glue::jit::cxx_argument_mapper _arg_mapper;
   kernel_configuration _config;
+  glue::jit::reflection_map _reflection_map;
 };
 
 }
diff --git a/include/hipSYCL/runtime/kernel_cache.hpp b/include/hipSYCL/runtime/kernel_cache.hpp
index 3c6e7904a..4143ed16a 100644
--- a/include/hipSYCL/runtime/kernel_cache.hpp
+++ b/include/hipSYCL/runtime/kernel_cache.hpp
@@ -109,7 +109,8 @@ class hcf_kernel_info {
 
   enum annotation_type {
     specialized,
-    fcall_specialized_config
+    fcall_specialized_config,
+    noalias
   };
 
   std::size_t get_argument_offset(std::size_t i) const;
diff --git a/include/hipSYCL/runtime/kernel_configuration.hpp b/include/hipSYCL/runtime/kernel_configuration.hpp
index 88d796a20..99521833e 100644
--- a/include/hipSYCL/runtime/kernel_configuration.hpp
+++ b/include/hipSYCL/runtime/kernel_configuration.hpp
@@ -69,6 +69,12 @@ enum class kernel_build_flag : int {
   spirv_enable_intel_llvm_spirv_options
 };
 
+enum class kernel_param_flag : int {
+  // these values are used as bit masks and should
+  // always have a value of a power of 2
+  noalias = 1
+};
+
 
 std::string to_string(kernel_build_flag f);
 
@@ -83,62 +89,6 @@ class kernel_configuration {
-
-  class s2_ir_configuration_entry {
-    static constexpr std::size_t buffer_size = 8;
-
-    std::string _name;
-    std::type_index _type;
-    std::array<char, buffer_size> _value;
-    std::size_t _data_size;
-
-
-    template <class T>
-    void store(const T &val) {
-      static_assert(sizeof(T) <= buffer_size,
-                    "Unsupported kernel configuration value type");
-      for(int i = 0; i < _value.size(); ++i)
-        _value[i] = 0;
-
-      memcpy(_value.data(), &val, sizeof(val));
-    }
-
-  public:
-    template <class T>
-    s2_ir_configuration_entry(const std::string &name, const T &val)
-        : _name{name}, _type{typeid(T)}, _data_size{sizeof(T)} {
-      store(val);
-    }
-
-    template <class T>
-    T get_value() const {
-      static_assert(sizeof(T) <= buffer_size,
-                    "Unsupported kernel configuration value type");
-      T v;
-      memcpy(&v, _value.data(), sizeof(T));
-      return v;
-    }
-
-    template <class T>
-    bool is_type() const {
-      return _type == typeid(T);
-    }
-
-    const void* get_data_buffer() const {
-      return _value.data();
-    }
-
-    std::size_t get_data_size() const {
-      return _data_size;
-    }
-
-    const std::string &get_name() const {
-      return _name;
-    }
-  };
-
-
-
 public:
   struct int_or_string{
     std::optional<uint64_t> int_value;
@@ -147,23 +97,23 @@ class kernel_configuration {
 
   using id_type = std::array<...>;
 
-  template <class T>
-  void set_s2_ir_constant(const std::string &config_parameter_name, const T &value) {
-    s2_ir_configuration_entry entry{config_parameter_name, value};
-    for(int i = 0; i < _s2_ir_configurations.size(); ++i) {
-      if(_s2_ir_configurations[i].get_name() == config_parameter_name) {
-        _s2_ir_configurations[i] = entry;
+  void set_specialized_kernel_argument(int param_index, uint64_t buffer_value) {
+    for(int i = 0; i < _specialized_kernel_args.size(); ++i) {
+      if(_specialized_kernel_args[i].first == param_index) {
+        _specialized_kernel_args[i] = std::make_pair(param_index, buffer_value);
         return;
       }
     }
-    _s2_ir_configurations.push_back(entry);
-  }
-
-  void set_specialized_kernel_argument(int param_index, uint64_t buffer_value) {
     _specialized_kernel_args.push_back(
         std::make_pair(param_index, buffer_value));
   }
 
+  void set_kernel_param_flag(int param_index, kernel_param_flag flag) {
+    if(_kernel_param_flags.size() <= param_index)
+      _kernel_param_flags.resize(param_index+1, 0);
+    _kernel_param_flags[param_index] |= static_cast<int>(flag);
+  }
+
   void set_function_call_specialization_config(
       int param_index, glue::sscp::fcall_config_kernel_property_t config) {
     _function_call_specializations.push_back(config);
@@ -191,6 +141,16 @@ class kernel_configuration {
     _build_flags.push_back(flag);
   }
 
+  void set_known_alignment(int param_index, int alignment) {
+    for(auto &entry : _known_alignments) {
+      if(entry.first == param_index) {
+        entry.second = alignment;
+        return;
+      }
+    }
+    _known_alignments.push_back(std::make_pair(param_index, alignment));
+  }
+
   template <class ValueT>
   void append_base_configuration(kernel_base_config_parameter key,
                                  const ValueT &value) {
@@ -211,12 +171,6 @@ class kernel_configuration {
   id_type generate_id() const {
     id_type result = _base_configuration_result;
 
-    for(const auto &entry : _s2_ir_configurations) {
-      add_entry_to_hash(result, entry.get_name().data(),
-                        entry.get_name().size(), entry.get_data_buffer(),
-                        entry.get_data_size());
-    }
-
     for(const auto &entry : _build_options) {
       uint64_t numeric_option_id = static_cast<uint64_t>(entry.first) | (1ull << 32);
       if(entry.second.int_value.has_value()) {
@@ -249,11 +203,23 @@ class kernel_configuration {
                         &config_id, sizeof(config_id));
     }
 
-    return result;
-  }
+    for(int i = 0; i < _kernel_param_flags.size(); ++i) {
+      if(_kernel_param_flags[i] != 0) {
+        auto flags = _kernel_param_flags[i];
+        uint64_t numeric_option_id = static_cast<uint64_t>(i) | (1ull << 36);
+        add_entry_to_hash(result, &numeric_option_id, sizeof(numeric_option_id),
+                          &flags, sizeof(flags));
+      }
+    }
+
+    for(const auto &entry : _known_alignments) {
+      uint64_t numeric_option_id = static_cast<uint64_t>(entry.first) | (1ull << 37);
+      uint64_t config_id = entry.second;
+      add_entry_to_hash(result, &numeric_option_id, sizeof(numeric_option_id),
+                        &config_id, sizeof(config_id));
+    }
 
-  const auto &s2_ir_entries() const {
-    return _s2_ir_configurations;
+    return result;
   }
 
   const auto &build_options() const {
@@ -272,9 +238,25 @@ class kernel_configuration {
     return _function_call_specializations;
   }
 
+  bool has_kernel_param_flag(int param_index, kernel_param_flag flag) const {
+    if(param_index < _kernel_param_flags.size()) {
+      return _kernel_param_flags[param_index] & static_cast<int>(flag);
+    }
+
+    return false;
+  }
+
+  std::size_t get_num_kernel_param_indices() const {
+    return _kernel_param_flags.size();
+  }
+
+  const auto &known_alignments() const {
+    return _known_alignments;
+  }
+
 private:
   static const void* data_ptr(const char* data) {
-    return data_ptr(std::string{data});
+    return data;
   }
 
   static const void* data_ptr(const std::string &data) {
@@ -296,7 +278,7 @@ class kernel_configuration {
   }
 
   static std::size_t data_size(const char* data) {
-    return data_size(std::string{data});
+    return data_size(std::string_view{data});
   }
 
   static std::size_t data_size(const std::string &data) {
@@ -327,13 +309,13 @@ class kernel_configuration {
     hash[entry_hash % hash.size()] ^= entry_hash;
   }
 
-
-  std::vector<s2_ir_configuration_entry> _s2_ir_configurations;
   std::vector<kernel_build_flag> _build_flags;
   std::vector<std::pair<kernel_build_option, int_or_string>> _build_options;
 
   std::vector<std::pair<int, uint64_t>> _specialized_kernel_args;
   std::vector<glue::sscp::fcall_config_kernel_property_t> _function_call_specializations;
+  std::vector<int> _kernel_param_flags;
+  std::vector<std::pair<int, int>> _known_alignments;
 
   id_type _base_configuration_result = {};
 };
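
Usage sketch for the new per-parameter metadata (the index and alignment values below are illustrative). Every call feeds generate_id(), so two builds of the same kernel that differ only in these annotations get distinct JIT cache entries:

    hipsycl::rt::kernel_configuration config;
    config.set_specialized_kernel_argument(/*param_index=*/2, /*buffer_value=*/42);
    config.set_kernel_param_flag(/*param_index=*/0,
                                 hipsycl::rt::kernel_param_flag::noalias);
    config.set_known_alignment(/*param_index=*/1, /*alignment=*/16);
    // All of the above now enter the configuration hash:
    auto id = config.generate_id();
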
diff --git a/include/hipSYCL/runtime/ocl/ocl_allocator.hpp b/include/hipSYCL/runtime/ocl/ocl_allocator.hpp
index 85ab58748..4c45e5daf 100644
--- a/include/hipSYCL/runtime/ocl/ocl_allocator.hpp
+++ b/include/hipSYCL/runtime/ocl/ocl_allocator.hpp
@@ -23,16 +23,16 @@ class ocl_allocator : public backend_allocator
 {
 public:
   ocl_allocator() = default;
-  ocl_allocator(ocl_usm* usm_provier);
+  ocl_allocator(rt::device_id dev, ocl_usm* usm_provier);
 
-  virtual void* allocate(size_t min_alignment, size_t size_bytes) override;
+  virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override;
 
-  virtual void *allocate_optimized_host(size_t min_alignment,
+  virtual void *raw_allocate_optimized_host(size_t min_alignment,
                                         size_t bytes) override;
 
-  virtual void free(void *mem) override;
+  virtual void raw_free(void *mem) override;
 
-  virtual void *allocate_usm(size_t bytes) override;
+  virtual void *raw_allocate_usm(size_t bytes) override;
 
   virtual bool is_usm_accessible_from(backend_descriptor b) const override;
 
   virtual result query_pointer(const void *ptr, pointer_info &out) const override;
@@ -40,8 +40,10 @@ class ocl_allocator : public backend_allocator
   virtual result mem_advise(const void *addr, std::size_t num_bytes,
                             int advise) const override;
 
+  virtual device_id get_device() const override;
 private:
   ocl_usm* _usm;
+  rt::device_id _dev;
 };
 
 }
diff --git a/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp b/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp
index 156ae1a10..c204f0990 100644
--- a/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp
+++ b/include/hipSYCL/runtime/ocl/ocl_hardware_manager.hpp
@@ -57,6 +57,8 @@ class ocl_hardware_context : public hardware_context
 
   virtual ~ocl_hardware_context();
 
+  virtual std::size_t get_platform_index() const override;
+
   ocl_allocator* get_allocator();
   ocl_usm* get_usm_provider();
@@ -86,6 +88,7 @@ class ocl_hardware_manager : public backend_hardware_manager
   virtual std::size_t get_num_devices() const override;
   virtual hardware_context *get_device(std::size_t index) override;
   virtual device_id get_device_id(std::size_t index) const override;
+  virtual std::size_t get_num_platforms() const override;
 
   virtual ~ocl_hardware_manager() {}
 
diff --git a/include/hipSYCL/runtime/ocl/ocl_queue.hpp b/include/hipSYCL/runtime/ocl/ocl_queue.hpp
index ff4de7fde..0141488ec 100644
--- a/include/hipSYCL/runtime/ocl/ocl_queue.hpp
+++ b/include/hipSYCL/runtime/ocl/ocl_queue.hpp
@@ -19,6 +19,7 @@
 
 #include "hipSYCL/common/spin_lock.hpp"
 #include "hipSYCL/glue/llvm-sscp/jit.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp"
 #include "hipSYCL/runtime/event.hpp"
 #include "hipSYCL/runtime/generic/async_worker.hpp"
 #include "hipSYCL/runtime/ocl/ocl_code_object.hpp"
@@ -107,6 +108,7 @@ class ocl_queue : public inorder_queue
   common::spin_lock _sscp_submission_spin_lock;
   glue::jit::cxx_argument_mapper _arg_mapper;
   kernel_configuration _config;
+  glue::jit::reflection_map _reflection_map;
 };
 
 }
diff --git a/include/hipSYCL/runtime/omp/omp_allocator.hpp b/include/hipSYCL/runtime/omp/omp_allocator.hpp
index d74375ca5..8487a8c02 100644
--- a/include/hipSYCL/runtime/omp/omp_allocator.hpp
+++ b/include/hipSYCL/runtime/omp/omp_allocator.hpp
@@ -21,14 +21,14 @@ class omp_allocator : public backend_allocator
 public:
   omp_allocator(const device_id &my_device);
 
-  virtual void* allocate(size_t min_alignment, size_t size_bytes) override;
+  virtual void* raw_allocate(size_t min_alignment, size_t size_bytes) override;
 
-  virtual void *allocate_optimized_host(size_t min_alignment,
+  virtual void *raw_allocate_optimized_host(size_t min_alignment,
                                         size_t bytes) override;
 
-  virtual void free(void *mem) override;
+  virtual void raw_free(void *mem) override;
 
-  virtual void *allocate_usm(size_t bytes) override;
+  virtual void *raw_allocate_usm(size_t bytes) override;
 
   virtual bool is_usm_accessible_from(backend_descriptor b) const override;
 
   virtual result query_pointer(const void *ptr,
@@ -36,10 +36,14 @@ class omp_allocator : public backend_allocator
   virtual result mem_advise(const void *addr, std::size_t num_bytes,
                             int advise) const override;
+
+  virtual device_id get_device() const override;
 private:
   device_id _my_device;
 };
+
+
 }
 }
diff --git a/include/hipSYCL/runtime/omp/omp_code_object.hpp b/include/hipSYCL/runtime/omp/omp_code_object.hpp
index 0a4f655d0..576495777 100644
--- a/include/hipSYCL/runtime/omp/omp_code_object.hpp
+++ b/include/hipSYCL/runtime/omp/omp_code_object.hpp
@@ -29,14 +29,17 @@ class omp_sscp_executable_object : public code_object {
   // The kernel argument struct providing work-group information.
   struct work_group_info {
     work_group_info(rt::range<3> num_groups, rt::id<3> group_id,
-                    rt::range<3> local_size, void* local_memory)
+                    rt::range<3> local_size, void *local_memory,
+                    void *internal_local_memory)
         : _num_groups(num_groups), _group_id(group_id), _local_size(local_size),
-          _local_memory(local_memory) {}
+          _local_memory(local_memory),
+          _internal_local_memory(internal_local_memory) {}
 
     rt::range<3> _num_groups;
     rt::range<3> _group_id;
     rt::range<3> _local_size;
     void* _local_memory;
+    void* _internal_local_memory;
   };
 
   using omp_sscp_kernel = void(const work_group_info *, void **);
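
For illustration, this is roughly how a host-side wrapper would drive such a kernel with the extended struct (sketch; the kernel pointer, argument array and the two local-memory buffers are assumed to be supplied by the runtime):

    using exe_obj = hipsycl::rt::omp_sscp_executable_object;

    void run_one_group(exe_obj::omp_sscp_kernel *kernel, void **args,
                       hipsycl::rt::range<3> num_groups,
                       hipsycl::rt::id<3> group_id,
                       hipsycl::rt::range<3> local_size,
                       void *dynamic_local_mem, void *internal_local_mem) {
      exe_obj::work_group_info info{num_groups, group_id, local_size,
                                    dynamic_local_mem, internal_local_mem};
      kernel(&info, args);
    }
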
diff --git a/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp b/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp
index 74eac0e05..e38547799 100644
--- a/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp
+++ b/include/hipSYCL/runtime/omp/omp_hardware_manager.hpp
@@ -40,6 +40,8 @@ class omp_hardware_context : public hardware_context
   virtual std::string get_driver_version() const override;
   virtual std::string get_profile() const override;
 
+  virtual std::size_t get_platform_index() const override;
+
   virtual ~omp_hardware_context() {}
 };
 
@@ -49,6 +51,7 @@ class omp_hardware_manager : public backend_hardware_manager
   virtual std::size_t get_num_devices() const override;
   virtual hardware_context *get_device(std::size_t index) override;
   virtual device_id get_device_id(std::size_t index) const override;
+  virtual std::size_t get_num_platforms() const override;
 
   virtual ~omp_hardware_manager(){}
 
 private:
diff --git a/include/hipSYCL/runtime/omp/omp_queue.hpp b/include/hipSYCL/runtime/omp/omp_queue.hpp
index 52d3320b9..b5a653ee7 100644
--- a/include/hipSYCL/runtime/omp/omp_queue.hpp
+++ b/include/hipSYCL/runtime/omp/omp_queue.hpp
@@ -17,11 +17,13 @@
 #include "../device_id.hpp"
 #include "hipSYCL/common/spin_lock.hpp"
 #include "hipSYCL/glue/llvm-sscp/jit.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp"
 
 namespace hipsycl {
 namespace rt {
 
 class omp_queue;
+class omp_backend;
 
 class omp_sscp_code_object_invoker : public sscp_code_object_invoker {
 public:
@@ -50,7 +52,7 @@ class omp_sscp_code_object_invoker : public sscp_code_object_invoker {
 class omp_queue : public inorder_queue
 {
 public:
-  omp_queue(backend_id id);
+  omp_queue(omp_backend* be, int dev);
 
   virtual ~omp_queue();
 
   /// Inserts an event into the stream
@@ -94,6 +96,7 @@ class omp_queue : public inorder_queue
   common::spin_lock _sscp_submission_spin_lock;
   glue::jit::cxx_argument_mapper _arg_mapper;
   kernel_configuration _config;
+  glue::jit::reflection_map _reflection_map;
 };
 
 }
diff --git a/include/hipSYCL/runtime/runtime_event_handlers.hpp b/include/hipSYCL/runtime/runtime_event_handlers.hpp
new file mode 100644
index 000000000..9b841cbd1
--- /dev/null
+++ b/include/hipSYCL/runtime/runtime_event_handlers.hpp
@@ -0,0 +1,48 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef ACPP_RT_EVENT_HANDLERS_HPP
+#define ACPP_RT_EVENT_HANDLERS_HPP
+
+#include <cstddef>
+
+#include "backend.hpp"
+#include "device_id.hpp"
+#include "settings.hpp"
+
+namespace hipsycl {
+namespace rt {
+
+struct allocation_info {
+  enum class allocation_type {
+    device,
+    shared,
+    host
+  };
+
+  rt::device_id dev;
+  allocation_type alloc_type;
+};
+
+class runtime_event_handlers {
+public:
+  runtime_event_handlers();
+  void on_new_allocation(const void*, std::size_t, const allocation_info &info);
+  void on_deallocation(const void* ptr);
+private:
+  bool _needs_allocation_tracking;
+};
+
+
+}
+}
+
+
+#endif
diff --git a/include/hipSYCL/runtime/settings.hpp b/include/hipSYCL/runtime/settings.hpp
index 83040837a..59b0781a6 100644
--- a/include/hipSYCL/runtime/settings.hpp
+++ b/include/hipSYCL/runtime/settings.hpp
@@ -107,7 +107,8 @@ enum class setting {
   adaptivity_level,
   jitopt_iads_relative_threshold,
   jitopt_iads_relative_eviction_threshold,
-  jitopt_iads_relative_threshold_min_data
+  jitopt_iads_relative_threshold_min_data,
+  enable_allocation_tracking
 };
 
 template <setting S> struct setting_trait {};
@@ -146,6 +147,7 @@ HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::jitopt_iads_relative_eviction_threshold,
 HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::jitopt_iads_relative_threshold_min_data,
                               "jitopt_iads_relative_threshold_min_data", std::size_t)
 
+HIPSYCL_RT_MAKE_SETTING_TRAIT(setting::enable_allocation_tracking, "allocation_tracking", bool)
 
 class settings
 {
@@ -190,6 +192,8 @@ class settings
       return _jitopt_iads_relative_threshold_min_data;
     } else if constexpr(S == setting::jitopt_iads_relative_eviction_threshold) {
       return _jitopt_iads_relative_eviction_threshold;
+    } else if constexpr(S == setting::enable_allocation_tracking) {
+      return _enable_allocation_tracking;
     }
     return typename setting_trait<S>::type{};
   }
@@ -242,6 +246,8 @@ class settings
         get_environment_variable_or_default(0.1);
     _jitopt_iads_relative_threshold_min_data =
         get_environment_variable_or_default(1024);
+    _enable_allocation_tracking =
+        get_environment_variable_or_default(false);
   }
 
 private:
@@ -273,6 +279,7 @@ class settings
   double _jitopt_iads_relative_threshold;
   double _jitopt_iads_relative_eviction_threshold;
   std::size_t _jitopt_iads_relative_threshold_min_data;
+  bool _enable_allocation_tracking;
 };
 
 }
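
Allocation tracking is opt-in; given the trait registration above, it is presumably controlled by an ACPP_ALLOCATION_TRACKING environment variable, following the runtime's usual naming scheme. Once enabled, interior pointers can be mapped back to their owning allocation, roughly like this (sketch; the helper below is illustrative, not an existing API):

    #include <cstdint>
    #include "hipSYCL/runtime/allocation_tracker.hpp"

    bool is_tracked_shared_allocation(const void *ptr) {
      hipsycl::rt::allocation_info info;
      uint64_t root_address = 0;
      if (!hipsycl::rt::allocation_tracker::query_allocation(ptr, info,
                                                             root_address))
        return false;
      // info.dev identifies the owning device; root_address is the allocation
      // base, so interior pointers resolve to their allocation.
      return info.alloc_type ==
             hipsycl::rt::allocation_info::allocation_type::shared;
    }
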
override; virtual bool is_usm_accessible_from(backend_descriptor b) const override; virtual result query_pointer(const void* ptr, pointer_info& out) const override; virtual result mem_advise(const void *addr, std::size_t num_bytes, int advise) const override; + + virtual device_id get_device() const override; private: ze_context_handle_t _ctx; ze_device_handle_t _dev; uint32_t _global_mem_ordinal; + device_id _dev_id; const ze_hardware_manager* _hw_manager; }; diff --git a/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp b/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp index 0b411c4ad..88a21e711 100644 --- a/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp +++ b/include/hipSYCL/runtime/ze/ze_hardware_manager.hpp @@ -88,6 +88,8 @@ class ze_hardware_context : public hardware_context virtual std::string get_driver_version() const override; virtual std::string get_profile() const override; + virtual std::size_t get_platform_index() const override; + virtual ~ze_hardware_context(); ze_driver_handle_t get_ze_driver() const @@ -119,6 +121,7 @@ class ze_hardware_manager : public backend_hardware_manager virtual std::size_t get_num_devices() const override; virtual hardware_context *get_device(std::size_t index) override; virtual device_id get_device_id(std::size_t index) const override; + virtual std::size_t get_num_platforms() const override; virtual ~ze_hardware_manager() {} diff --git a/include/hipSYCL/runtime/ze/ze_queue.hpp b/include/hipSYCL/runtime/ze/ze_queue.hpp index c37ceef73..045e4f115 100644 --- a/include/hipSYCL/runtime/ze/ze_queue.hpp +++ b/include/hipSYCL/runtime/ze/ze_queue.hpp @@ -18,6 +18,7 @@ #include "../executor.hpp" #include "../inorder_queue.hpp" #include "hipSYCL/glue/llvm-sscp/jit.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/reflection_map.hpp" #include "hipSYCL/runtime/code_object_invoker.hpp" #include "hipSYCL/runtime/event.hpp" #include "hipSYCL/runtime/hints.hpp" @@ -107,6 +108,7 @@ class ze_queue : public inorder_queue // SSCP submission data glue::jit::cxx_argument_mapper _arg_mapper; kernel_configuration _config; + glue::jit::reflection_map _reflection_map; }; } diff --git a/include/hipSYCL/std/stdpar/detail/allocation_map.hpp b/include/hipSYCL/std/stdpar/detail/allocation_map.hpp index 93ce420d3..e24d0ff42 100644 --- a/include/hipSYCL/std/stdpar/detail/allocation_map.hpp +++ b/include/hipSYCL/std/stdpar/detail/allocation_map.hpp @@ -8,8 +8,8 @@ * See file LICENSE in the project root for full license details. 
*/ // SPDX-License-Identifier: BSD-2-Clause -#ifndef HIPSYCL_ALLOCATION_MAP_HPP -#define HIPSYCL_ALLOCATION_MAP_HPP +#ifndef ACPP_STDPAR_ALLOCATION_DATA_STRUCTURES_HPP +#define ACPP_STDPAR_ALLOCATION_DATA_STRUCTURES_HPP #include @@ -22,468 +22,14 @@ #include #include +#include "hipSYCL/common/allocation_map.hpp" + extern "C" void *__libc_malloc(size_t); extern "C" void __libc_free(void*); namespace hipsycl::stdpar { -struct default_allocation_map_payload {}; - -template -class bit_tree { -protected: - - static constexpr int num_levels = sizeof...(Bit_sizes); - static constexpr int root_level_idx = num_levels - 1; - static constexpr int bitsizes[num_levels] = {Bit_sizes...}; - - static constexpr int get_num_entries_in_level(int level) { - return 1ull << bitsizes[level]; - } - - static constexpr int get_bitoffset_in_level(int level) { - int result = 0; - for(int i = 0; i < level; ++i) { - result += bitsizes[i]; - } - return result; - } - - static constexpr int get_index_in_level(Int_type address, int level) { - Int_type bitmask = get_n_low_bits_set(bitsizes[level]); - return (address >> get_bitoffset_in_level(level)) & bitmask; - } - - static constexpr uint64_t get_n_low_bits_set(int n) { - if(n == 64) - return ~0ull; - return (1ull << n) - 1; - } - - static constexpr uint64_t get_space_spanned_by_node_in_level(int level) { - uint64_t result = 1; - for(int i = 0; i < level; ++i) - result *= get_num_entries_in_level(level); - return result; - } - - template - static T* alloc(int count) { - return static_cast(__libc_malloc(sizeof(T) * count)); - } - - static void free(void* ptr) { - __libc_free(ptr); - } -}; - -template -class allocation_map : public bit_tree { -public: - static_assert(sizeof(void*) == 8, "Unsupported pointer size"); - static_assert(std::is_trivial_v, "UserPayload must be trivial type"); - - allocation_map() - : _num_in_progress_operations{0} {} - - struct value_type : public UserPayload { - std::size_t allocation_size; - }; - - // Access entry of allocation that address belongs to, or nullptr if the address - // does not belong to a known allocation. - value_type* get_entry(uint64_t address, uint64_t& root_address) noexcept { - insert_or_get_entry_lock lock{_num_in_progress_operations}; - root_address = 0; - int num_leaf_attempts = 0; - return get_entry(_root, address, num_leaf_attempts, root_address); - } - - // Access entry of allocation that has the given address. Unlike get_entry(), - // this does not succeed if the address does not point to the base of the allocation. - value_type* get_entry_of_root_address(uint64_t address) noexcept { - insert_or_get_entry_lock lock{_num_in_progress_operations}; - return get_entry_of_root_address(_root, address); - } - - // Insert new element. Element's allocation range must be - // non-overlapping w.r.t existing entries. - // ~0ull is unsupported, because then non-zero allocation - // ranges cannot be expressed. 
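(Aside: the address decomposition performed by this bit tree is easy to see in isolation. The sketch below reproduces the indexing helpers with illustrative bit sizes, sixteen 4-bit levels covering a 64-bit address; it is not part of the patch, and the real structure, which this change moves to common/allocation_map.hpp, chooses its own level widths.)

    #include <cstdint>
    #include <cstdio>

    // Illustrative configuration: sixteen levels of 4 bits each span a
    // full 64-bit address space.
    constexpr int bitsizes[16] = {4, 4, 4, 4, 4, 4, 4, 4,
                                  4, 4, 4, 4, 4, 4, 4, 4};

    constexpr int get_bitoffset_in_level(int level) {
      int result = 0;
      for (int i = 0; i < level; ++i)
        result += bitsizes[i];
      return result;
    }

    constexpr uint64_t get_n_low_bits_set(int n) {
      return (n == 64) ? ~0ull : (1ull << n) - 1;
    }

    constexpr int get_index_in_level(uint64_t address, int level) {
      return (address >> get_bitoffset_in_level(level)) &
             get_n_low_bits_set(bitsizes[level]);
    }

    int main() {
      // Walking from the root down to the leaf, each level consumes its
      // slice of the address bits; 0xDEADBEEF yields the nibbles
      // D, E, A, D, B, E, E, F (high to low) in the lowest eight levels.
      uint64_t address = 0xDEADBEEF;
      for (int level = 15; level >= 0; --level)
        std::printf("level %2d -> child index %2d\n", level,
                    get_index_in_level(address, level));
    }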
- bool insert(uint64_t address, const value_type& v) { - insert_or_get_entry_lock lock{_num_in_progress_operations}; - return insert(_root, address, v); - } - - bool erase(uint64_t address) { - erase_lock lock{_num_in_progress_operations}; - return erase(_root, address); - } - - ~allocation_map() { - for(int i = 0; i < get_num_entries_in_level(root_level_idx); ++i) { - auto* ptr = _root.children[i].load(std::memory_order_acquire); - if(ptr) - release(*ptr); - } - } - -private: - // Useful for debugging/printing - template - void with_decomposed_address(uint64_t address, int current_level, F&& handler) { - for(int i = root_level_idx; i >= current_level; --i) { - handler(get_index_in_level(address, i)); - } - for(int i = current_level - 1; i >= 0; --i) { - handler(-1); - } - } - - template - void print(Ostream& ostr, uint64_t address, int level) { - with_decomposed_address(address, level, [&](int x){ - if(x >= 0) - ostr << x << "."; - else - ostr << "x"; - }); - ostr << "\n"; - } - - struct leaf_node { - leaf_node() - : num_entries {} { - for(int i = 0; i < get_num_entries_in_level(0); ++i) { - entries[i].allocation_size = 0; - } - } - - value_type entries [get_num_entries_in_level(0)]; - std::atomic num_entries; - }; - - template - struct intermediate_node { - private: - static constexpr auto make_child() { - if constexpr (Level > 1) return - intermediate_node{}; - else return leaf_node{}; - } - public: - intermediate_node() - : children{}, num_entries{} {} - - using child_type = decltype(make_child()); - - std::atomic children [get_num_entries_in_level(Level)]; - std::atomic num_entries; - }; - - value_type *get_entry(leaf_node ¤t_node, uint64_t address, - int &/*num_leaf_attempts*/, - uint64_t &root_address) noexcept { - int start_address = 0; - - uint64_t max_local_address = - root_address | (get_num_entries_in_level(0) - 1); - - if(max_local_address <= address) - start_address = get_num_entries_in_level(0) - 1; - else - start_address = get_index_in_level(address, 0); - - for (int local_address = start_address; local_address >= 0; - --local_address) { - - auto& element = current_node.entries[local_address]; - - std::size_t allocation_size = - __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); - if(allocation_size > 0) { - - uint64_t root_address_candidate = - root_address | - (static_cast(local_address) << get_bitoffset_in_level(0)); - - uint64_t allocation_end = root_address_candidate + allocation_size; - if(address >= root_address_candidate && address < allocation_end) { - root_address = root_address_candidate; - return &element; - } else { - return nullptr; - } - - } - } - return nullptr; - } - - template - value_type *get_entry(intermediate_node ¤t_node, - uint64_t address, - int& num_leaf_attempts, - uint64_t& root_address) noexcept { - // If the queried address is too close to the next allocation, - // it can happen that the search converges on the next allocation. - // Therefore, to exclude that case, if a search fails, we also - // need to try again with the next allocation before that. - // This variable counts how many leaves we have accessed. If it - // reaches two, we can abort. - if constexpr(Level == root_level_idx) { - num_leaf_attempts = 0; - } - - uint64_t max_local_address = - root_address | - get_n_low_bits_set(get_bitoffset_in_level(Level) + bitsizes[Level]); - - // We are always looking for the next allocation preceding the - // current address. If the maximum local address in this node - // cannot reach the search address, (e.g. 
if we are looking in - // a preceding node at the same level), we need to start from - // the maximum address. Otherwise, we need to look at the bits - // set in this address. - int start_address = 0; - if(max_local_address <= address) - start_address = get_num_entries_in_level(Level) - 1; - else - start_address = get_index_in_level(address, Level); - - for (int local_address = start_address; - local_address >= 0; --local_address) { - - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - - if(ptr) { - uint64_t root_address_candidate = - root_address | (static_cast(local_address) - << get_bitoffset_in_level(Level)); - - auto* ret = get_entry(*ptr, address, num_leaf_attempts, - root_address_candidate); - // If we are in level 1, ret refers to a leaf node - if constexpr(Level == 1) { - ++num_leaf_attempts; - } - - if(ret) { - root_address = root_address_candidate; - return ret; - } else if(num_leaf_attempts >= 2) { - // We can abort if we have looked at the first hit leaf node, - // and the one before that. - return nullptr; - } - } - } - return nullptr; - } - - value_type *get_entry_of_root_address(leaf_node ¤t_node, uint64_t address) noexcept { - int local_address = get_index_in_level(address, 0); - - auto& element = current_node.entries[local_address]; - std::size_t allocation_size = - __atomic_load_n(&(element.allocation_size), __ATOMIC_ACQUIRE); - - if (allocation_size > 0) { - return &element; - } - - return nullptr; - } - - template - value_type *get_entry_of_root_address(intermediate_node ¤t_node, - uint64_t address) noexcept { - int local_address = get_index_in_level(address, Level); - - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - - if(ptr) { - return get_entry_of_root_address(*ptr, address); - } - return nullptr; - } - - bool insert(leaf_node ¤t_node, uint64_t address, const value_type &v) { - - int local_address = get_index_in_level(address, 0); - - std::size_t *allocation_size_ptr = - &(current_node.entries[local_address].allocation_size); - - std::size_t allocation_size = __atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE); - if(allocation_size > 0) { - // Entry is already occupied - return false; - } - - __atomic_store_n(allocation_size_ptr, v.allocation_size, __ATOMIC_RELEASE); - current_node.entries[local_address].UserPayload::operator=(v); - - current_node.num_entries.fetch_add( - 1, std::memory_order_acq_rel); - - return true; - } - - template - bool insert(intermediate_node ¤t_node, uint64_t address, - const value_type &v) { - using child_t = typename intermediate_node::child_type; - - int local_address = get_index_in_level(address, Level); - - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - - if(!ptr) { - child_t* new_child = alloc(1); - new (new_child) child_t{}; - - if (!current_node.children[local_address].compare_exchange_strong( - ptr /* == nullptr*/, new_child, std::memory_order_acq_rel)) { - // Assigning new child has failed because child is no longer nullptr - // -> free new child again - destroy(*new_child); - this->free(new_child); - } else { - current_node.num_entries.fetch_add( - 1, std::memory_order_acq_rel); - ptr = new_child; - } - } - - return insert(*ptr, address, v); - } - - bool erase(leaf_node& current_node, uint64_t address) { - int local_address = get_index_in_level(address, 0); - - std::size_t *allocation_size_ptr = - &(current_node.entries[local_address].allocation_size); - // Entry was already deleted or does not exist - 
if(__atomic_load_n(allocation_size_ptr, __ATOMIC_ACQUIRE) == 0) - return false; - - __atomic_store_n(allocation_size_ptr, 0, __ATOMIC_RELEASE); - - current_node.num_entries.fetch_sub( - 1, std::memory_order_acq_rel); - - return true; - } - - template - bool erase(intermediate_node ¤t_node, uint64_t address) { - - int local_address = get_index_in_level(address, Level); - auto *ptr = current_node.children[local_address].load( - std::memory_order_acquire); - if(!ptr) - return false; - - bool result = erase(*ptr, address); - if(result) { - if(ptr->num_entries.load(std::memory_order_acquire) == 0) { - auto *current_ptr = current_node.children[local_address].exchange( - nullptr, std::memory_order_acq_rel); - // TODO: We could potentially get erase() lock-free - // by counting by how many ops each node is currently used, - // and waiting here until the count turns to 0. - if(current_ptr) { - destroy(*current_ptr); - this->free(current_ptr); - current_node.num_entries.fetch_sub( - 1, std::memory_order_acq_rel); - } - } - } - return result; - } - - void release(leaf_node& current_node) { - destroy(current_node); - } - - template - void release(intermediate_node& current_node) { - for(int i = 0; i < get_num_entries_in_level(Level); ++i){ - if (auto *ptr = current_node.children[i].load( - std::memory_order_acquire)) { - release(*ptr); - this->free(ptr); - } - } - destroy(current_node); - } - - void destroy(leaf_node& node) { - node.~leaf_node(); - } - - template - void destroy(intermediate_node& node) { - node.~intermediate_node(); - } - - struct erase_lock { - public: - erase_lock(std::atomic& op_counter) - : _op_counter{op_counter} { - int expected = 0; - while (!_op_counter.compare_exchange_strong( - expected, -1, std::memory_order_release, std::memory_order_relaxed)) { - expected = 0; - } - } - - ~erase_lock() { - _op_counter.store(0, std::memory_order_release); - } - private: - std::atomic& _op_counter; - }; - - struct insert_or_get_entry_lock { - public: - insert_or_get_entry_lock(std::atomic& op_counter) - : _op_counter{op_counter} { - int expected = std::max(0, _op_counter.load(std::memory_order_acquire)); - while (!_op_counter.compare_exchange_strong( - expected, expected+1, std::memory_order_release, - std::memory_order_relaxed)) { - if(expected < 0) - expected = 0; - } - } - - ~insert_or_get_entry_lock() { - _op_counter.fetch_sub(1, std::memory_order_acq_rel); - } - private: - std::atomic& _op_counter; - }; - - intermediate_node _root; - std::atomic _num_in_progress_operations; -}; - - - - - - - - template class libc_allocator{ public: @@ -505,6 +51,17 @@ class libc_allocator{ } }; +struct libc_untyped_allocator { + static void* allocate(size_t n) { + return __libc_malloc(n); + } + + static void deallocate(void* ptr) { + __libc_free(ptr); + } +}; + + template bool operator==(libc_allocator const &, libc_allocator const &) noexcept { return true; @@ -515,6 +72,10 @@ bool operator!=(libc_allocator const &x, return !(x == y); } +template +using allocation_map = + common::allocation_map; + class free_space_map { public: free_space_map(std::size_t max_assignable_space) diff --git a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp index b2bc1d8a2..c4e2809b7 100644 --- a/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp +++ b/include/hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp @@ -50,7 +50,12 @@ struct find_if_not {}; struct all_of {}; struct any_of {}; struct none_of {}; - +struct sort {}; +struct merge {}; 
+struct inclusive_scan {}; +struct exclusive_scan {}; +struct transform_inclusive_scan {}; +struct transform_exclusive_scan {}; struct transform_reduce {}; struct reduce {}; } // namespace algorithm_type diff --git a/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp b/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp index aefc0db98..a74af9f36 100644 --- a/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp +++ b/include/hipSYCL/std/stdpar/detail/sycl_glue.hpp @@ -32,6 +32,7 @@ #include "allocation_map.hpp" +#include "hipSYCL/runtime/application.hpp" #include "offload_heuristic_db.hpp" #include "hipSYCL/runtime/settings.hpp" #include "hipSYCL/sycl/info/device.hpp" @@ -252,6 +253,15 @@ class memory_pool { assert(is_from_pool(ptr)); assert(is_from_pool((char*)ptr+size)); assert((uint64_t)ptr % _page_size == 0); + + // Inform the runtime that there is a new user allocation + // by invoking the runtime hook. We need to do this manually + // because memory pool directly uses raw backend allocation commands. + rt::application::event_handler_layer().on_new_allocation( + ptr, size, + rt::allocation_info{_dev, + rt::allocation_info::allocation_type::shared}); + return ptr; } @@ -262,14 +272,14 @@ class memory_pool { if(_pool && is_from_pool(ptr)) { uint64_t address = reinterpret_cast(ptr)-reinterpret_cast(_base_address); _free_space_map.release(address, size); + + rt::application::event_handler_layer().on_deallocation(ptr); } } ~memory_pool() { // Memory pool might be destroyed after runtime shutdown, so rely on OS // to clean up for now - //if(_pool) - // sycl::free(_pool, detail::single_device_dispatch::get_queue()); } std::size_t get_size() const { @@ -285,13 +295,25 @@ class memory_pool { } private: + void* raw_malloc_shared(std::size_t bytes, sycl::queue& q) { + auto *allocator = sycl::detail::select_usm_allocator(q.get_context(), + q.get_device()); + return allocator->raw_allocate_usm(bytes); + } + void init() { HIPSYCL_DEBUG_INFO << "[stdpar] Building a memory pool of size " << static_cast(_pool_size) / (1024 * 1024 * 1024) << " GB" << std::endl; + auto& q = detail::single_device_dispatch::get_queue(); + _dev = q.get_device().AdaptiveCpp_device_id(); + + // We need to call raw_allocate_usm so that we can inform the runtime's allocation + // tracking mechanism of actual user allocations, not just of the memory pool as a + // whole. 
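+    // The contract with the new tracking layer: one on_new_allocation()
+    // per user-visible allocation and one on_deallocation() per free,
+    // e.g. (using the names introduced by this patch):
+    //
+    //   rt::application::event_handler_layer().on_new_allocation(
+    //       ptr, size,
+    //       rt::allocation_info{dev,
+    //           rt::allocation_info::allocation_type::shared});
+    //   rt::application::event_handler_layer().on_deallocation(ptr);
+    //
+    // The handlers only track allocations when the allocation_tracking
+    // setting is enabled (presumably via ACPP_ALLOCATION_TRACKING,
+    // following the usual naming scheme of the runtime settings).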
// Make sure to allocate an additional page so that we can fix alignment if needed - _pool = sycl::malloc_shared( - _pool_size + _page_size, detail::single_device_dispatch::get_queue()); + _pool = raw_malloc_shared(_pool_size + _page_size, q); + uint64_t aligned_pool_base = next_multiple_of((uint64_t)_pool, _page_size); _base_address = (void*)aligned_pool_base; assert(aligned_pool_base % _page_size == 0); @@ -303,6 +325,7 @@ class memory_pool { void* _base_address; free_space_map _free_space_map; std::size_t _page_size; + rt::device_id _dev; }; class unified_shared_memory { diff --git a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp index 4eea4ca21..89e2c085f 100644 --- a/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/algorithm.hpp @@ -20,6 +20,7 @@ #include "../detail/stdpar_defs.hpp" #include "../detail/offload.hpp" #include "hipSYCL/algorithms/algorithm.hpp" +#include "hipSYCL/algorithms/util/allocation_cache.hpp" #include "hipSYCL/std/stdpar/detail/offload_heuristic_db.hpp" namespace std { @@ -153,9 +154,23 @@ ForwardIt2 copy_if(hipsycl::stdpar::par_unseq, ForwardIt2 d_first, UnaryPredicate pred) { auto offloader = [&](auto& queue){ + auto output_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::host>(); + auto device_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + std::size_t *num_elements_copied = + output_scratch_group.obtain(1); + + hipsycl::algorithms::copy_if(queue, device_scratch_group, first, last, + d_first, pred, num_elements_copied); + queue.wait(); + ForwardIt2 d_last = d_first; - std::advance(d_last, std::distance(first, last)); - hipsycl::algorithms::copy_if(queue, first, last, d_first, pred); + std::advance(d_last, *num_elements_copied); return d_last; }; @@ -164,7 +179,7 @@ ForwardIt2 copy_if(hipsycl::stdpar::par_unseq, d_first, pred); }; - HIPSYCL_STDPAR_OFFLOAD( + HIPSYCL_STDPAR_BLOCKING_OFFLOAD( hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::copy_if{}, hipsycl::stdpar::par_unseq{}), std::distance(first, last), ForwardIt2, offloader, fallback, first, @@ -487,6 +502,113 @@ bool none_of(hipsycl::stdpar::par_unseq, ForwardIt first, ForwardIt last, +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par_unseq, RandomIt first, + RandomIt last) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last); + }; + + auto fallback = [&](){ + std::sort(hipsycl::stdpar::par_unseq_host_fallback, first, last); + }; + + HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last)); +} + + +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par_unseq, RandomIt first, + RandomIt last, Compare comp) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last, comp); + }; + + auto fallback = [&]() { + std::sort(hipsycl::stdpar::par_unseq_host_fallback, first, last, comp); + }; + + HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), comp); +} + 
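(With the sort entry points above and the merge entry points that follow, the corresponding standard calls become offload candidates. A minimal usage sketch, assuming a build with AdaptiveCpp's stdpar offloading mode, e.g. --acpp-stdpar:)

    #include <algorithm>
    #include <execution>
    #include <vector>

    int main() {
      std::vector<int> a{5, 1, 4, 2, 3};
      std::vector<int> b{0, 6, 7};
      std::vector<int> out(a.size() + b.size());

      // Dispatched through the offload entry points; the fallback lambdas
      // route to the host implementation where offload is not possible.
      std::sort(std::execution::par_unseq, a.begin(), a.end());
      std::merge(std::execution::par_unseq, a.begin(), a.end(),
                 b.begin(), b.end(), out.begin());
    }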
+ +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par_unseq, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first, Compare comp) { + auto offloader = [&](auto &queue) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first, comp); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_unseq_host_fallback, first1, last1, + first2, last2, d_first, comp); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first, comp); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par_unseq, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first) { + auto offloader = [&](auto &queue) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_unseq_host_fallback, first1, last1, + first2, last2, d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first); +} //////////////////// par policy ///////////////////////////////////// @@ -618,9 +740,23 @@ ForwardIt2 copy_if(hipsycl::stdpar::par, ForwardIt2 d_first, UnaryPredicate pred) { auto offloader = [&](auto& queue){ + auto output_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::host>(); + auto device_scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + std::size_t *num_elements_copied = + output_scratch_group.obtain(1); + + hipsycl::algorithms::copy_if(queue, device_scratch_group, first, last, + d_first, pred, num_elements_copied); + queue.wait(); + ForwardIt2 d_last = d_first; - std::advance(d_last, std::distance(first, last)); - hipsycl::algorithms::copy_if(queue, first, last, d_first, pred); + std::advance(d_last, *num_elements_copied); return d_last; }; @@ -629,7 +765,7 @@ ForwardIt2 copy_if(hipsycl::stdpar::par, d_first, pred); }; - HIPSYCL_STDPAR_OFFLOAD( + HIPSYCL_STDPAR_BLOCKING_OFFLOAD( hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::copy_if{}, hipsycl::stdpar::par{}), std::distance(first, last), ForwardIt2, offloader, fallback, first, @@ -951,9 +1087,113 @@ bool none_of(hipsycl::stdpar::par, ForwardIt first, 
ForwardIt last, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), p); } +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par, RandomIt first, + RandomIt last) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last); + }; + auto fallback = [&](){ + std::sort(hipsycl::stdpar::par_host_fallback, first, last); + }; + HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last)); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT void sort(hipsycl::stdpar::par, RandomIt first, + RandomIt last, Compare comp) { + auto offloader = [&](auto& queue) { + hipsycl::algorithms::sort(queue, first, last, comp); + }; + auto fallback = [&]() { + std::sort(hipsycl::stdpar::par_host_fallback, first, last, comp); + }; + + HIPSYCL_STDPAR_OFFLOAD_NORET( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::sort{}, + hipsycl::stdpar::par{}), + std::distance(first, last), offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), comp); +} + + + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first, Compare comp) { + auto offloader = [&](auto &queue) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first, comp); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_unseq_host_fallback, first1, last1, + first2, last2, d_first, comp); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first, comp); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt3 merge(hipsycl::stdpar::par, + ForwardIt1 first1, ForwardIt1 last1, + ForwardIt2 first2, ForwardIt2 last2, + ForwardIt3 d_first) { + auto offloader = [&](auto &queue) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + + hipsycl::algorithms::merge(queue, scratch_group, first1, last1, first2, + last2, d_first); + auto d_last = d_first; + std::advance(d_last, + std::distance(first1, last1) + std::distance(first2, last2)); + return d_last; + }; + + auto fallback = [&]() { + return std::merge(hipsycl::stdpar::par_host_fallback, first1, last1, + first2, last2, d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm(hipsycl::stdpar::algorithm_category::merge{}, + hipsycl::stdpar::par{}), + std::distance(first1, last1) + std::distance(first2, last2), ForwardIt3, + offloader, fallback, first1, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last1), + first2, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last2), d_first); +} } diff --git a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp index f8b3776dd..2f3b8d4d7 100644 --- 
a/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp +++ b/include/hipSYCL/std/stdpar/pstl-impl/numeric.hpp @@ -291,6 +291,290 @@ T reduce(hipsycl::stdpar::par_unseq, ForwardIt first, } + +// scans + + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op); + } + return result; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op); + +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, + OutputIt d_first, + BinaryOp op, T init) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op, init); + } + return result; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op, init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, + OutputIt d_first) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first); + } + return result; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt +exclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, OutputIt d_first, T init, + BinaryOp op) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + 
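+    // A scan writes exactly one output element per input element, so the
+    // iterator to return is known before anything is submitted:
+    // d_first + problem_size. Only the element values are computed on device.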
std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init, op); + } + return result; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, init, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt exclusive_scan(hipsycl::stdpar::par_unseq, + InputIt first, InputIt last, OutputIt d_first, + T init) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init); + } + return result; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, first, + last, d_first, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init); +} + + + +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par_unseq, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan( + queue, scratch_group, first, last, d_first, binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, + first, last, d_first, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par_unseq, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op, T init) { + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan(queue, scratch_group, 
first, + last, d_first, binary_op, + unary_op, init); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_inclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, + first, last, d_first, binary_op, + unary_op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op, + init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt2 transform_exclusive_scan(hipsycl::stdpar::par_unseq, ForwardIt1 first, + ForwardIt1 last, ForwardIt2 d_first, T init, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_exclusive_scan(queue, scratch_group, first, + last, d_first, init, + binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_exclusive_scan(hipsycl::stdpar::par_unseq_host_fallback, + first, last, d_first, init, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_exclusive_scan{}, + hipsycl::stdpar::par_unseq{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, binary_op, + unary_op); +} + //////////////////// par policy ///////////////////////////////////// @@ -559,6 +843,286 @@ T reduce(hipsycl::stdpar::par, ForwardIt first, HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), init, binary_op); } +// scans + + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt inclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, OutputIt d_first, BinaryOp op) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op); + } + return result; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op); + +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, + OutputIt d_first, + BinaryOp op, T init) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first, op, init); + } + 
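+    // For an empty input range nothing was submitted above, and result
+    // still equals d_first.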
return result; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, op, init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT OutputIt inclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, + OutputIt d_first) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::inclusive_scan(queue, scratch_group, first, last, + d_first); + } + return result; + }; + + auto fallback = [&]() { + return std::inclusive_scan(hipsycl::stdpar::par_host_fallback, first, last, + d_first); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt +exclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, OutputIt d_first, T init, + BinaryOp op) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init, op); + } + return result; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, init, op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +OutputIt exclusive_scan(hipsycl::stdpar::par, + InputIt first, InputIt last, OutputIt d_first, + T init) { + + auto offloader = [&](auto& queue){ + OutputIt result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if(problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::exclusive_scan(queue, scratch_group, first, last, + d_first, init); + } + return result; + }; + + auto fallback = [&]() { + return std::exclusive_scan(hipsycl::stdpar::par_host_fallback, first, + last, d_first, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::exclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), OutputIt, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par, ForwardIt1 first, 
ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan( + queue, scratch_group, first, last, d_first, binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_inclusive_scan(hipsycl::stdpar::par_host_fallback, + first, last, d_first, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT ForwardIt2 transform_inclusive_scan( + hipsycl::stdpar::par, ForwardIt1 first, ForwardIt1 last, ForwardIt2 d_first, + BinaryOp binary_op, UnaryOp unary_op, T init) { + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_inclusive_scan(queue, scratch_group, first, + last, d_first, binary_op, + unary_op, init); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_inclusive_scan(hipsycl::stdpar::par_host_fallback, + first, last, d_first, binary_op, + unary_op, init); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_inclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, binary_op, unary_op, + init); +} + +template +HIPSYCL_STDPAR_ENTRYPOINT +ForwardIt2 transform_exclusive_scan(hipsycl::stdpar::par, ForwardIt1 first, + ForwardIt1 last, ForwardIt2 d_first, T init, + BinaryOp binary_op, UnaryOp unary_op) { + + auto offloader = [&](auto &queue) { + ForwardIt2 result = d_first; + auto problem_size = std::distance(first, last); + std::advance(result, problem_size); + if (problem_size > 0) { + auto scratch_group = + hipsycl::stdpar::detail::stdpar_tls_runtime::get() + .make_scratch_group< + hipsycl::algorithms::util::allocation_type::device>(); + hipsycl::algorithms::transform_exclusive_scan(queue, scratch_group, first, + last, d_first, init, + binary_op, unary_op); + } + return result; + }; + + auto fallback = [&]() { + return std::transform_exclusive_scan(hipsycl::stdpar::par_host_fallback, + first, last, d_first, init, binary_op, + unary_op); + }; + + HIPSYCL_STDPAR_OFFLOAD( + hipsycl::stdpar::algorithm( + hipsycl::stdpar::algorithm_category::transform_exclusive_scan{}, + hipsycl::stdpar::par{}), + std::distance(first, last), ForwardIt2, offloader, fallback, first, + HIPSYCL_STDPAR_NO_PTR_VALIDATION(last), d_first, init, binary_op, + unary_op); +} } diff --git a/include/hipSYCL/sycl/buffer.hpp b/include/hipSYCL/sycl/buffer.hpp index 0fa83271d..91f027b2e 100644 --- a/include/hipSYCL/sycl/buffer.hpp +++ b/include/hipSYCL/sycl/buffer.hpp @@ -20,6 +20,7 @@ 
#include #include #include +#include #include "hipSYCL/common/debug.hpp" #include "hipSYCL/runtime/allocator.hpp" @@ -190,6 +191,21 @@ struct buffer_impl bool destructor_waits; bool use_external_storage; + buffer_impl() { + static std::atomic was_warning_emitted = false; + if(!was_warning_emitted) { + HIPSYCL_DEBUG_WARNING << "This application uses SYCL buffers; the SYCL " + "buffer-accessor model is well-known to introduce unnecessary " + "overheads. Please consider migrating to the SYCL2020 USM model, " + "in particular device USM (sycl::malloc_device) combined with " + "in-order queues for more performance. See the AdaptiveCpp " + "performance guide for more information: \n" + "https://github.com/AdaptiveCpp/AdaptiveCpp/blob/develop/doc/performance.md" + << std::endl; + was_warning_emitted = true; + } + } + ~buffer_impl() { if (writes_back) { if (!writeback_ptr) { @@ -1194,17 +1210,18 @@ class buffer : public detail::property_carrying_object if(!_impl->data->has_allocation(host_device)){ if(this->has_property()){ // TODO: Actually may need to use non-host backend here... - host_ptr = - rt->backends().get(host_device.get_backend()) - ->get_allocator(host_device) - ->allocate_optimized_host( - alignof(T), _impl->data->get_num_elements().size() * sizeof(T)); + auto* allocator = rt->backends().get(host_device.get_backend()) + ->get_allocator(host_device); + host_ptr = rt::allocate_host(allocator, alignof(T), + _impl->data->get_num_elements().size() * + sizeof(T)); } else { - host_ptr = - rt->backends().get(host_device.get_backend()) - ->get_allocator(host_device) - ->allocate( - alignof(T), _impl->data->get_num_elements().size() * sizeof(T)); + auto *allocator = rt->backends() + .get(host_device.get_backend()) + ->get_allocator(host_device); + host_ptr = rt::allocate_device(allocator, alignof(T), + _impl->data->get_num_elements().size() * + sizeof(T)); } if(!host_ptr) diff --git a/include/hipSYCL/sycl/context.hpp b/include/hipSYCL/sycl/context.hpp index 572719e04..5da23dd55 100644 --- a/include/hipSYCL/sycl/context.hpp +++ b/include/hipSYCL/sycl/context.hpp @@ -32,6 +32,8 @@ class context; namespace detail { const rt::unique_device_list& extract_context_devices(const context&); + +struct default_context_tag_t{}; } class context @@ -84,6 +86,38 @@ class context _impl->devices.add(detail::get_host_device()); } + explicit context( + detail::default_context_tag_t, + async_handler handler = + [](exception_list e) { glue::default_async_handler(e); }) + : context{handler} { + _impl->is_default_context = true; + } + + explicit context( + detail::default_context_tag_t, const device &dev, + async_handler handler = + [](exception_list e) { glue::default_async_handler(e); }) + : context{dev, handler} { + _impl->is_default_context = true; + } + + explicit context( + detail::default_context_tag_t, const platform &plt, + async_handler handler = + [](exception_list e) { glue::default_async_handler(e); }) + : context{plt, handler} { + _impl->is_default_context = true; + } + + explicit context( + detail::default_context_tag_t, const std::vector &deviceList, + async_handler handler = + [](exception_list e) { glue::default_async_handler(e); }) + : context{deviceList, handler} { + _impl->is_default_context = true; + } + bool is_host() const { bool has_non_host_devices = false; _impl->devices.for_each_device([&](rt::device_id d) { @@ -94,29 +128,35 @@ class context } platform get_platform() const { - bool found_device_backend = false; - rt::backend_id last_backend; + bool found_device_platform = false; + 
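+    // A platform is now identified by a (backend, platform index) pair: each
+    // backend reports how many platforms it exposes via get_num_platforms(),
+    // and the loop below scans all of them instead of assuming one platform
+    // per backend.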
rt::platform_id last_platform; this->_impl->devices.for_each_backend([&](rt::backend_id b) { - if (b != detail::get_host_device().get_backend()) { - if (found_device_backend) { - // We already have a device backend - HIPSYCL_DEBUG_WARNING - << "context: get_platform() was called but this context spans " - "multiple backends/platforms. Only returning last platform" - << std::endl; + rt::backend* backend = this->_impl->requires_runtime.get()->backends().get(b); + + for (std::size_t platform_index = 0; + platform_index < backend->get_hardware_manager()->get_num_platforms(); + ++platform_index) { + if (b != detail::get_host_device().get_backend()) { + if (found_device_platform) { + // We already have a device backend + HIPSYCL_DEBUG_WARNING + << "context: get_platform() was called but this context spans " + "multiple backends/platforms. Only returning last platform" + << std::endl; + } + + last_platform = rt::platform_id{b, platform_index}; + found_device_platform = true; } - - last_backend = b; - found_device_backend = true; } }); - if (!found_device_backend) { - last_backend = detail::get_host_device().get_backend(); + if (!found_device_platform) { + last_platform = rt::platform_id{detail::get_host_device().get_backend(), 0}; } - return platform{last_backend}; + return platform{last_platform}; } vector_class get_devices() const { @@ -135,11 +175,26 @@ class context } std::size_t AdaptiveCpp_hash_code() const { + if(_impl && _impl->is_default_context) { + std::size_t hash = 0; + _impl->devices.for_each_device([&](rt::device_id dev){ + // xor ensures that device order does not matter + hash ^= dev.hash_code(); + }); + return hash; + } return std::hash{}(_impl.get()); } - friend bool operator ==(const context& lhs, const context& rhs) - { return lhs._impl == rhs._impl; } + friend bool operator ==(const context& lhs, const context& rhs) { + + if (lhs._impl && rhs._impl && lhs._impl->is_default_context && + rhs._impl->is_default_context) { + return lhs._impl->devices == rhs._impl->devices; + } + + return lhs._impl == rhs._impl; + } friend bool operator!=(const context& lhs, const context &rhs) { return !(lhs == rhs); } @@ -148,7 +203,6 @@ class context return _impl->requires_runtime.get(); } - [[deprecated("Use AdaptiveCpp_hash_code()")]] auto hipSYCL_hash_code() const { return AdaptiveCpp_hash_code(); @@ -179,7 +233,8 @@ class context context_impl() : devices{requires_runtime.get()} {} - async_handler handler; + async_handler handler; + bool is_default_context = false; }; std::shared_ptr _impl; @@ -203,6 +258,10 @@ inline const rt::unique_device_list &extract_context_devices(const context &ctx) } +inline context platform::khr_get_default_context() const { + return context{detail::default_context_tag_t{}, *this}; +} + inline exception::exception(context ctx, std::error_code ec, const std::string& what_arg) : _context{std::make_shared(ctx)}, error_code{ec}, _msg{what_arg} {} diff --git a/include/hipSYCL/sycl/detail/namespace_compat.hpp b/include/hipSYCL/sycl/detail/namespace_compat.hpp new file mode 100644 index 000000000..9d9b672c5 --- /dev/null +++ b/include/hipSYCL/sycl/detail/namespace_compat.hpp @@ -0,0 +1,25 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#ifndef ACPP_NAMESPACE_COMPAT
+#define ACPP_NAMESPACE_COMPAT
+
+#ifndef ACPP_NO_SHORT_NAMESPACE
+namespace acpp {
+  using namespace hipsycl;
+}
+#endif
+
+namespace adaptivecpp {
+  using namespace hipsycl;
+}
+
+#endif
diff --git a/include/hipSYCL/sycl/device_selector.hpp b/include/hipSYCL/sycl/device_selector.hpp
index c6a37d299..19b993a8d 100644
--- a/include/hipSYCL/sycl/device_selector.hpp
+++ b/include/hipSYCL/sycl/device_selector.hpp
@@ -108,7 +108,14 @@ inline int select_default(const device& dev) {
   } else if(dev.is_cpu()) {
     // Prefer CPU over GPUs that don't have compiled kernels
     // and cannot run kernels.
-    return 1;
+
+    // Prefer a non-OpenMP CPU device: the OpenMP backend cannot be disabled,
+    // so otherwise there would be no way to select e.g. an OpenCL CPU device
+    // via ACPP_VISIBILITY_MASK.
+    if(dev.get_backend() != sycl::backend::omp)
+      return 1;
+    else
+      return 0;
   } else {
     // Never select GPUs without compiled kernels
     return -1;
diff --git a/include/hipSYCL/sycl/extensions.hpp b/include/hipSYCL/sycl/extensions.hpp
index c88d284fd..5456a4b43 100644
--- a/include/hipSYCL/sycl/extensions.hpp
+++ b/include/hipSYCL/sycl/extensions.hpp
@@ -74,5 +74,11 @@
 #define ACPP_EXT_QUEUE_PRIORITY
 #define ACPP_EXT_SPECIALIZED
 #define ACPP_EXT_DYNAMIC_FUNCTIONS
+#define ACPP_EXT_RESTRICT_PTR
+#define ACPP_EXT_JIT_COMPILE_IF
+
+// KHR extensions
+
+#define SYCL_KHR_DEFAULT_CONTEXT 1
 #endif
diff --git a/include/hipSYCL/sycl/is_device_copyable.hpp b/include/hipSYCL/sycl/is_device_copyable.hpp
new file mode 100644
index 000000000..29e3c5276
--- /dev/null
+++ b/include/hipSYCL/sycl/is_device_copyable.hpp
@@ -0,0 +1,61 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#ifndef HIPSYCL_IS_DEVICE_COPYABLE_HPP
+#define HIPSYCL_IS_DEVICE_COPYABLE_HPP
+
+#include <array>
+#include <optional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+// AdaptiveCpp does not use this type trait to restrict the allowed
+// arguments to a kernel - it is provided purely for compatibility.
+
+#define SYCL_DEVICE_COPYABLE 1
+
+namespace hipsycl {
+namespace sycl {
+
+template <class T> struct is_device_copyable;
+
+namespace detail {
+template <class T, typename = void>
+struct is_device_copyable_impl : std::is_trivially_copyable<T> {};
+
+template <class T>
+struct is_device_copyable_impl<
+    T, std::enable_if_t<!std::is_same_v<T, std::remove_cv_t<T>>>>
+    : is_device_copyable<std::remove_cv_t<T>> {};
+}
+
+template <class T>
+struct is_device_copyable : detail::is_device_copyable_impl<T> {};
+
+template <class T>
+inline constexpr bool is_device_copyable_v = is_device_copyable<T>::value;
+
+template <class T>
+struct is_device_copyable<std::array<T, 0>> : std::true_type {};
+
+template <class T, std::size_t N>
+struct is_device_copyable<std::array<T, N>> : is_device_copyable<T> {};
+
+template <class T>
+struct is_device_copyable<std::optional<T>> : is_device_copyable<T> {};
+
+template <class T1, class T2>
+struct is_device_copyable<std::pair<T1, T2>>
+    : std::bool_constant<is_device_copyable_v<T1> && is_device_copyable_v<T2>> {};
+
+template <class... Ts>
+struct is_device_copyable<std::tuple<Ts...>>
+    : std::bool_constant<(... && is_device_copyable_v<Ts>)> {};
+
+}
+}
+
+#endif
diff --git a/include/hipSYCL/sycl/jit.hpp b/include/hipSYCL/sycl/jit.hpp
index c81463ee4..e9e72a583 100644
--- a/include/hipSYCL/sycl/jit.hpp
+++ b/include/hipSYCL/sycl/jit.hpp
@@ -20,6 +20,7 @@
 #if ACPP_LIBKERNEL_IS_DEVICE_PASS_SSCP
 #include "hipSYCL/glue/reflection.hpp"
 #include "hipSYCL/glue/llvm-sscp/fcall_specialization.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp"
 #include "hipSYCL/common/stable_running_hash.hpp"
 #include "hipSYCL/common/unordered_dense.hpp"
 #include "exception.hpp"
@@ -35,7 +36,7 @@ extern "C" void __acpp_function_annotation_dynamic_function_def_arg1();
 template <class T>
 void __acpp_function_annotation_argument_used(T&& x);
 
-namespace hipsycl::sycl::jit {
+namespace hipsycl::sycl::AdaptiveCpp_jit {
 
 template <class T>
 void arguments_are_used(T&& x) {
@@ -274,7 +275,21 @@ class dynamic_function_config {
 }
 
+#else // IS_DEVICE_PASS_SSCP
+
+// Define at least the namespace so that users can set global aliases
+// for convenience, instead of only being able to define them inside
+// __acpp_if_target_sscp().
+namespace hipsycl::sycl::AdaptiveCpp_jit {}
 #endif // IS_DEVICE_PASS_SSCP
 
+// Set the jit alias for convenience. If SYCL ever claims this namespace,
+// we will have to remove it, so it is not currently publicly advertised.
+// However, it aligns with some early published examples of our JIT
+// capabilities - if users try those, we need this alias.
+namespace hipsycl::sycl::jit {
+using namespace hipsycl::sycl::AdaptiveCpp_jit;
+}
+
 #endif
diff --git a/include/hipSYCL/sycl/libkernel/accessor.hpp b/include/hipSYCL/sycl/libkernel/accessor.hpp
index 25a5b3829..b28631885 100644
--- a/include/hipSYCL/sycl/libkernel/accessor.hpp
+++ b/include/hipSYCL/sycl/libkernel/accessor.hpp
@@ -44,6 +44,7 @@
 #include "item.hpp"
 #include "multi_ptr.hpp"
 #include "atomic.hpp"
+#include "../specialized.hpp"
 #include "detail/local_memory_allocator.hpp"
 #include "detail/mobile_shared_ptr.hpp"
@@ -2148,8 +2149,8 @@ class accessor<
     : _addr{addr}, _num_elements{r} {}
 
-  address _addr{};
-  range<dimensions> _num_elements;
+  specialized<address> _addr;
+  specialized<range<dimensions>> _num_elements;
 };
_addr; + specialized> _num_elements; }; namespace detail::accessor { diff --git a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp index 8becda6f5..f4e5c42b4 100644 --- a/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp +++ b/include/hipSYCL/sycl/libkernel/generic/hiplike/atomic_builtins.hpp @@ -17,6 +17,7 @@ #include + #if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA || \ ACPP_LIBKERNEL_IS_DEVICE_PASS_HIP @@ -40,10 +41,62 @@ inline constexpr int builtin_memory_order(memory_order o) noexcept { return __ATOMIC_RELAXED; } +#if ACPP_LIBKERNEL_IS_DEVICE_PASS_CUDA && !defined(ACPP_LIBKERNEL_CUDA_NVCXX) + #define ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS +#endif + +#ifdef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS +// LLVM NVPTX backend does currently not properly support acquire/release +// atomics. We workaround this for two load/store instructions that we +// need for the algorithms library using inline assembly. +__attribute__((always_inline)) +HIPSYCL_HIPLIKE_BUILTIN +void __acpp_cuda_atomic_store_device_rel_i32(int32_t *ptr, int32_t x) { +#if __CUDA_ARCH__ < 700 + __threadfence(); + *ptr = x; + __threadfence(); +#else + asm volatile("st.release.gpu.s32 [%0], %1;" + : + :"l"(ptr), "r"(x) + : "memory"); +#endif +} + +__attribute__((always_inline)) +HIPSYCL_HIPLIKE_BUILTIN +int32_t __acpp_cuda_atomic_load_device_acq_i32(int32_t *ptr) { +#if __CUDA_ARCH__ < 700 + __threadfence(); + int32_t res = *ptr; + __threadfence(); + return res; +#else + int32_t result; + asm volatile("ld.acquire.gpu.u32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; +#endif +} + +#endif + template HIPSYCL_HIPLIKE_BUILTIN void __acpp_atomic_store(T *addr, T x, memory_order order, memory_scope scope) noexcept { + if constexpr(sizeof(T) == sizeof(int32_t)) { +#ifdef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS + if(scope == memory_scope::device && order == memory_order::release){ + __acpp_cuda_atomic_store_device_rel_i32(reinterpret_cast(addr), + bit_cast(x)); + return; + } +#endif + } __atomic_store_n(addr, x, builtin_memory_order(order)); } @@ -66,6 +119,14 @@ __acpp_atomic_store(double *addr, double x, memory_order order, template HIPSYCL_HIPLIKE_BUILTIN T __acpp_atomic_load(T *addr, memory_order order, memory_scope scope) noexcept { + if constexpr(sizeof(T) == sizeof(int32_t)) { +#ifdef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS + if(scope == memory_scope::device && order == memory_order::acquire){ + return bit_cast(__acpp_cuda_atomic_load_device_acq_i32( + reinterpret_cast(addr))); + } +#endif + } return __atomic_load_n(addr, builtin_memory_order(order)); } @@ -493,6 +554,8 @@ __acpp_atomic_fetch_max(double *addr, double x, memory_order order, } } +#undef ACPP_NEEDS_CUDA_ATOMIC_WORKAROUNDS + #endif #endif diff --git a/include/hipSYCL/sycl/libkernel/group_functions.hpp b/include/hipSYCL/sycl/libkernel/group_functions.hpp index 6df05afb5..22224f677 100644 --- a/include/hipSYCL/sycl/libkernel/group_functions.hpp +++ b/include/hipSYCL/sycl/libkernel/group_functions.hpp @@ -186,7 +186,8 @@ OutPtr joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, first, last, result, binary_op); } -template +template>, bool> = true> HIPSYCL_BUILTIN T exclusive_scan_over_group(Group g, V x, T init, BinaryOperation binary_op) { HIPSYCL_RETURN_DISPATCH_GROUP_ALGORITHM(__acpp_exclusive_scan_over_group, diff --git a/include/hipSYCL/sycl/libkernel/restrict.hpp b/include/hipSYCL/sycl/libkernel/restrict.hpp new file mode 100644 index 
000000000..d92011931 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/restrict.hpp @@ -0,0 +1,69 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef ACPP_RESTRICT_HPP +#define ACPP_RESTRICT_HPP + +#include + +namespace hipsycl { +namespace sycl { + +namespace detail { + +template +struct __acpp_sscp_emit_param_type_annotation_restrict { + T value; +}; + +} // namespace detail + +template class AdaptiveCpp_restrict_ptr { +public: + template ::value>> + AdaptiveCpp_restrict_ptr() : _value{} {} + + AdaptiveCpp_restrict_ptr(const T &value) : _value{value} {} + + AdaptiveCpp_restrict_ptr(const AdaptiveCpp_restrict_ptr &other) + : _value{other._value.value} {} + + AdaptiveCpp_restrict_ptr(sycl::AdaptiveCpp_restrict_ptr &&other) { + swap(*this, other); + } + + AdaptiveCpp_restrict_ptr &operator=(const T &value) { + AdaptiveCpp_restrict_ptr tmp{value}; + swap(*this, tmp); + return *this; + } + + AdaptiveCpp_restrict_ptr &operator=(AdaptiveCpp_restrict_ptr other) { + swap(*this, other); + return *this; + } + + friend void swap(AdaptiveCpp_restrict_ptr &first, + AdaptiveCpp_restrict_ptr &second) { + using std::swap; + swap(first._value.value, second._value.value); + } + + operator T *() const { return _value.value; } + +private: + detail::__acpp_sscp_emit_param_type_annotation_restrict _value; +}; + +} // namespace sycl +} // namespace hipsycl + +#endif diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp index dc9c1c490..996ca4977 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp @@ -8,11 +8,10 @@ * See file LICENSE in the project root for full license details. 
*/ // SPDX-License-Identifier: BSD-2-Clause -#include "builtin_config.hpp" - #ifndef HIPSYCL_SSCP_BROADCAST_BUILTINS_HPP #define HIPSYCL_SSCP_BROADCAST_BUILTINS_HPP +#include "builtin_config.hpp" HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_work_group_broadcast_i8(__acpp_int32 sender, @@ -29,7 +28,6 @@ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int64 __acpp_sscp_work_group_broadcast_i64(__acpp_int32 sender, __acpp_int64 x); - HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_sub_group_broadcast_i8(__acpp_int32 sender, __acpp_int8 x); @@ -45,4 +43,6 @@ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int64 __acpp_sscp_sub_group_broadcast_i64(__acpp_int32 sender, __acpp_int64 x); + #endif + diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp index bc19c166c..39b678bb5 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/core.hpp @@ -12,6 +12,7 @@ #define HIPSYCL_SSCP_BUILTINS_CORE_HPP #include "builtin_config.hpp" +#include "core_typed.hpp" #include @@ -38,171 +39,6 @@ HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_z(); HIPSYCL_SSCP_BUILTIN bool __acpp_sscp_if_global_sizes_fit_in_int(); -template -T __acpp_sscp_typed_get_global_linear_id() { - if constexpr(Dim == 1) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lid_x = (T)__acpp_sscp_get_local_id_x(); - - return gid_x * lsize_x + lid_x; - } else if constexpr(Dim == 2) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lsize_y = (T)__acpp_sscp_get_local_size_y(); - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - - T id_x = gid_x * lsize_x + lid_x; - T id_y = gid_y * lsize_y + lid_y; - - T global_size_x = lsize_x * ngroups_x; - - return global_size_x * id_y + id_x; - } else if constexpr(Dim == 3) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - T gid_z = (T)__acpp_sscp_get_group_id_z(); - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lsize_y = (T)__acpp_sscp_get_local_size_y(); - T lsize_z = (T)__acpp_sscp_get_local_size_z(); - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - T lid_z = (T)__acpp_sscp_get_local_id_z(); - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); - - T id_x = gid_x * lsize_x + lid_x; - T id_y = gid_y * lsize_y + lid_y; - T id_z = gid_z * lsize_z + lid_z; - - T global_size_x = lsize_x * ngroups_x; - T global_size_y = lsize_y * ngroups_y; - - return global_size_x * global_size_y * id_z + global_size_x * id_y + id_x; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_local_linear_id() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_local_id_x(); - } else if constexpr(Dim == 2) { - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - - return lsize_x * lid_y + lid_x; - } else if constexpr(Dim == 3) { - T lid_x = (T)__acpp_sscp_get_local_id_x(); - T lid_y = (T)__acpp_sscp_get_local_id_y(); - T lid_z = (T)__acpp_sscp_get_local_id_z(); - - T lsize_x = (T)__acpp_sscp_get_local_size_x(); - T lsize_y = (T)__acpp_sscp_get_local_size_y(); - - return lsize_x * lsize_y * lid_z + lsize_x * lid_y + 
lid_x; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_group_linear_id() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_group_id_x(); - } else if constexpr(Dim == 2) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - - return ngroups_x * gid_y + gid_x; - } else if constexpr(Dim == 3) { - T gid_x = (T)__acpp_sscp_get_group_id_x(); - T gid_y = (T)__acpp_sscp_get_group_id_y(); - T gid_z = (T)__acpp_sscp_get_group_id_z(); - - T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); - T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); - - return ngroups_x * ngroups_y * gid_z + ngroups_x * gid_y + gid_x; - } else { - return 0; - } -} - -template -T __acpp_sscp_typed_get_global_size() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); - } else if constexpr(Dim == 2) { - T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); - T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); - - return size_x * size_y; - } else if constexpr(Dim == 3) { - T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); - T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); - T size_z = (T)__acpp_sscp_get_local_size_z() * (T)__acpp_sscp_get_num_groups_z(); - - return size_x * size_y * size_z; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_local_size() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_local_size_x(); - } else if constexpr(Dim == 2) { - T size_x = (T)__acpp_sscp_get_local_size_x(); - T size_y = (T)__acpp_sscp_get_local_size_y(); - - return size_x * size_y; - } else if constexpr(Dim == 3) { - T size_x = (T)__acpp_sscp_get_local_size_x(); - T size_y = (T)__acpp_sscp_get_local_size_y(); - T size_z = (T)__acpp_sscp_get_local_size_z(); - - return size_x * size_y * size_z; - } else { - return 0; - } -} - - -template -T __acpp_sscp_typed_get_num_groups() { - if constexpr(Dim == 1) { - return (T)__acpp_sscp_get_num_groups_x(); - } else if constexpr(Dim == 2) { - T size_x = (T)__acpp_sscp_get_num_groups_x(); - T size_y = (T)__acpp_sscp_get_num_groups_y(); - - return size_x * size_y; - } else if constexpr(Dim == 3) { - T size_x = (T)__acpp_sscp_get_num_groups_x(); - T size_y = (T)__acpp_sscp_get_num_groups_y(); - T size_z = (T)__acpp_sscp_get_num_groups_z(); - - return size_x * size_y * size_z; - } else { - return 0; - } -} - - - template size_t __acpp_sscp_get_global_linear_id() { if(__acpp_sscp_if_global_sizes_fit_in_int()) { diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/core_typed.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/core_typed.hpp new file mode 100644 index 000000000..5d607e722 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/core_typed.hpp @@ -0,0 +1,198 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_BUILTINS_CORE_TYPED_HPP +#define HIPSYCL_SSCP_BUILTINS_CORE_TYPED_HPP + +#include "builtin_config.hpp" + +#include + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_id_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_id_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_id_z(); + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_group_id_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_group_id_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_group_id_z(); + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_size_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_size_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_local_size_z(); + +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_x(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_y(); +HIPSYCL_SSCP_BUILTIN __acpp_uint64 __acpp_sscp_get_num_groups_z(); + +template +T __acpp_sscp_typed_get_global_linear_id() { + if constexpr(Dim == 1) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + T lid_x = (T)__acpp_sscp_get_local_id_x(); + + return gid_x * lsize_x + lid_x; + } else if constexpr(Dim == 2) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = (T)__acpp_sscp_get_group_id_y(); + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + T lsize_y = (T)__acpp_sscp_get_local_size_y(); + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + + T id_x = gid_x * lsize_x + lid_x; + T id_y = gid_y * lsize_y + lid_y; + + T global_size_x = lsize_x * ngroups_x; + + return global_size_x * id_y + id_x; + } else if constexpr(Dim == 3) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = (T)__acpp_sscp_get_group_id_y(); + T gid_z = (T)__acpp_sscp_get_group_id_z(); + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + T lsize_y = (T)__acpp_sscp_get_local_size_y(); + T lsize_z = (T)__acpp_sscp_get_local_size_z(); + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + T lid_z = (T)__acpp_sscp_get_local_id_z(); + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); + + T id_x = gid_x * lsize_x + lid_x; + T id_y = gid_y * lsize_y + lid_y; + T id_z = gid_z * lsize_z + lid_z; + + T global_size_x = lsize_x * ngroups_x; + T global_size_y = lsize_y * ngroups_y; + + return global_size_x * global_size_y * id_z + global_size_x * id_y + id_x; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_local_linear_id() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_local_id_x(); + } else if constexpr(Dim == 2) { + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + + return lsize_x * lid_y + lid_x; + } else if constexpr(Dim == 3) { + T lid_x = (T)__acpp_sscp_get_local_id_x(); + T lid_y = (T)__acpp_sscp_get_local_id_y(); + T lid_z = (T)__acpp_sscp_get_local_id_z(); + + T lsize_x = (T)__acpp_sscp_get_local_size_x(); + T lsize_y = (T)__acpp_sscp_get_local_size_y(); + + return lsize_x * lsize_y * lid_z + lsize_x * lid_y + lid_x; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_group_linear_id() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_group_id_x(); + } else if constexpr(Dim == 2) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = 
(T)__acpp_sscp_get_group_id_y(); + + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + + return ngroups_x * gid_y + gid_x; + } else if constexpr(Dim == 3) { + T gid_x = (T)__acpp_sscp_get_group_id_x(); + T gid_y = (T)__acpp_sscp_get_group_id_y(); + T gid_z = (T)__acpp_sscp_get_group_id_z(); + + T ngroups_x = (T)__acpp_sscp_get_num_groups_x(); + T ngroups_y = (T)__acpp_sscp_get_num_groups_y(); + + return ngroups_x * ngroups_y * gid_z + ngroups_x * gid_y + gid_x; + } else { + return 0; + } +} + +template +T __acpp_sscp_typed_get_global_size() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); + } else if constexpr(Dim == 2) { + T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); + + return size_x * size_y; + } else if constexpr(Dim == 3) { + T size_x = (T)__acpp_sscp_get_local_size_x() * (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_local_size_y() * (T)__acpp_sscp_get_num_groups_y(); + T size_z = (T)__acpp_sscp_get_local_size_z() * (T)__acpp_sscp_get_num_groups_z(); + + return size_x * size_y * size_z; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_local_size() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_local_size_x(); + } else if constexpr(Dim == 2) { + T size_x = (T)__acpp_sscp_get_local_size_x(); + T size_y = (T)__acpp_sscp_get_local_size_y(); + + return size_x * size_y; + } else if constexpr(Dim == 3) { + T size_x = (T)__acpp_sscp_get_local_size_x(); + T size_y = (T)__acpp_sscp_get_local_size_y(); + T size_z = (T)__acpp_sscp_get_local_size_z(); + + return size_x * size_y * size_z; + } else { + return 0; + } +} + + +template +T __acpp_sscp_typed_get_num_groups() { + if constexpr(Dim == 1) { + return (T)__acpp_sscp_get_num_groups_x(); + } else if constexpr(Dim == 2) { + T size_x = (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_num_groups_y(); + + return size_x * size_y; + } else if constexpr(Dim == 3) { + T size_x = (T)__acpp_sscp_get_num_groups_x(); + T size_y = (T)__acpp_sscp_get_num_groups_y(); + T size_z = (T)__acpp_sscp_get_num_groups_z(); + + return size_x * size_y * size_z; + } else { + return 0; + } +} + + +#endif diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp new file mode 100644 index 000000000..c5beda9da --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp @@ -0,0 +1,41 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_BROADCAST_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_BROADCAST_BUILTINS_HPP + +#include "../barrier.hpp" +#include "../broadcast.hpp" +#include "../builtin_config.hpp" +#include "../core_typed.hpp" +#include "../shuffle.hpp" +#include "utils.hpp" + +#undef ACPP_TEMPLATE_DECLARATION_WG_BROADCAST + +namespace hipsycl::libkernel::sscp { + +template T wg_broadcast(__acpp_int32 sender, T x, V shrd_memory) { + + if (sender == __acpp_sscp_typed_get_local_linear_id<3, int>()) { + shrd_memory[0] = x; + }; + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + x = shrd_memory[0]; + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + return x; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp new file mode 100644 index 000000000..8fa70e77f --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp @@ -0,0 +1,109 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_REDUCTION_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_REDUCTION_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +namespace { +template +OutType sg_reduce_impl(OutType x, BinaryOperation binary_op, __acpp_int32 active_threads) { + const __acpp_uint32 lrange = __acpp_sscp_get_subgroup_max_size(); + const __acpp_uint32 lid = __acpp_sscp_get_subgroup_local_id(); + const __acpp_uint64 subgroup_size = active_threads; + auto local_x = x; + for (__acpp_int32 i = lrange / 2; i > 0; i /= 2) { + auto other_x = __builtin_bit_cast( + OutType, + sg_select(__builtin_bit_cast(typename integer_type::type, local_x), lid + i)); + if (lid + i < subgroup_size) + local_x = binary_op(local_x, other_x); + } + return __builtin_bit_cast( + OutType, sg_select(__builtin_bit_cast(typename integer_type::type, local_x), 0)); +} +} // namespace + +template <__acpp_sscp_algorithm_op binary_op, typename OutType> OutType sg_reduce(OutType x) { + using op = typename get_op::type; + const __acpp_uint32 lrange = __acpp_sscp_get_subgroup_size(); + return sg_reduce_impl(x, op{}, lrange); +} + +template +OutType wg_reduce(OutType x, BinaryOperation op, MemoryType *shrd_mem) { + + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 wg_size = __acpp_sscp_typed_get_local_size<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + const __acpp_int32 first_sg_size = wg_broadcast(0, sg_size, &shrd_mem[0]); + + const __acpp_uint32 num_subgroups = (wg_size + max_sg_size - 1) / max_sg_size; + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + OutType local_reduce_result = sg_reduce_impl(x, op, sg_size); + + // Sum up until all sgs can load their data into shmem + if (subgroup_id < 
shmem_array_length) { + shrd_mem[subgroup_id] = local_reduce_result; + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + + for (int i = shmem_array_length; i < num_subgroups; i += shmem_array_length) { + if (subgroup_id >= i && subgroup_id < i + shmem_array_length) { + shrd_mem[subgroup_id % shmem_array_length] = + op(local_reduce_result, shrd_mem[subgroup_id % shmem_array_length]); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + + // Shared memory is now filled with the partial results of all subgroups. + // We reduce in shared memory until the remaining data fits into one subgroup. + size_t elements_in_shmem = + num_subgroups < shmem_array_length ? num_subgroups : shmem_array_length; + for (int i = shmem_array_length / 2; i >= first_sg_size; i /= 2) { + if (wg_lid < i && wg_lid + i < elements_in_shmem) { + shrd_mem[wg_lid] = op(shrd_mem[wg_lid + i], shrd_mem[wg_lid]); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + + // Now we load the data into registers + if (wg_lid < first_sg_size) { + local_reduce_result = shrd_mem[wg_lid]; + int active_threads = num_subgroups < first_sg_size ? num_subgroups : first_sg_size; + local_reduce_result = sg_reduce_impl(local_reduce_result, op, active_threads); + } + + // Do a final broadcast + using internal_type = typename integer_type<OutType>::type; + static_assert(sizeof(internal_type) == sizeof(OutType)); + local_reduce_result = __builtin_bit_cast( + OutType, + wg_broadcast(0, __builtin_bit_cast(internal_type, local_reduce_result), &shrd_mem[0])); + return local_reduce_result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file
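wg_reduce above proceeds in three phases: a per-subgroup reduction in registers, a reduction over the subgroup results staged in shared memory, and a final work-group broadcast. A worked illustration (invented values, not from the source) with op = plus, two subgroups of size 4, and x = 1 on every work item:

    sg_reduce_impl per subgroup              -> 4 and 4
    shrd_mem after the store phase           -> {4, 4}
    final sg_reduce_impl over loaded values  -> 8
    wg_broadcast of the result               -> every work item returns 8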
diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp new file mode 100644 index 000000000..9260d999e --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp @@ -0,0 +1,97 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_GENERIC_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_GENERIC_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "scan_subgroup.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template <bool ExclusiveScan, size_t SharedMemorySize, typename OutType, typename BinaryOperation, typename MemoryType> +OutType wg_generic_scan(OutType x, BinaryOperation op, MemoryType shrd_mem, OutType init = 0) { + + // The last element of the shared memory is used to store the total sum for exclusive scans. + const size_t shmem_array_length = SharedMemorySize - 1; + + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 wg_size = __acpp_sscp_typed_get_local_size<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + + const __acpp_uint32 num_subgroups = (wg_size + max_sg_size - 1) / max_sg_size; + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + const bool last_item_in_sg = (wg_lid % sg_size) == (sg_size - 1); + OutType sg_scan_result; + if constexpr (ExclusiveScan) { + sg_scan_result = sg_exclusive_scan(x, op, init); + } else { + sg_scan_result = sg_inclusive_scan(x, op); + } + + for (int i = 0; i < (num_subgroups - 1 + shmem_array_length) / shmem_array_length; i++) { + __acpp_uint32 first_active_thread = i * num_subgroups * max_sg_size; + __acpp_uint32 last_active_thread = (i + 1) * num_subgroups * max_sg_size; + last_active_thread = last_active_thread > wg_size ? wg_size : last_active_thread; + __acpp_uint32 relative_thread_id = wg_lid - first_active_thread; + if (subgroup_id / shmem_array_length == i) { + if (last_item_in_sg) { + + if constexpr (ExclusiveScan) { + shrd_mem[subgroup_id % shmem_array_length] = op(sg_scan_result, x); + } else { + shrd_mem[subgroup_id % shmem_array_length] = sg_scan_result; + } + } + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + // The first shmem_array_length threads now scan the stored subgroup results in shared memory + auto local_x = shrd_mem[relative_thread_id]; + for (__acpp_int32 j = 1; j < shmem_array_length; j *= 2) { + __acpp_int32 next_id = relative_thread_id - j; + if (next_id >= 0 && j <= relative_thread_id) { + if (relative_thread_id < shmem_array_length) { + auto other_x = shrd_mem[next_id]; + local_x = op(local_x, other_x); + shrd_mem[relative_thread_id] = local_x; + } + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + + if (subgroup_id > 0) { + auto current_segment_update = shrd_mem[(subgroup_id % shmem_array_length) - 1]; + sg_scan_result = op(current_segment_update, sg_scan_result); + } + if (i > 0) { + sg_scan_result = op(shrd_mem[shmem_array_length], sg_scan_result); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + shrd_mem[shmem_array_length] = sg_scan_result; + } + return sg_scan_result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file
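The generic scan mirrors the reduction's subgroup-then-shared-memory structure, but processes subgroup totals in chunks of shmem_array_length whenever a work group contains more subgroups than shared-memory slots. For intuition (an invented example with op = plus and every item contributing 1): an inclusive scan returns 1, 2, 3, ... across the work group, while the exclusive variant is seeded with init and returns init, init + 1, init + 2, ...; storing op(sg_scan_result, x) in the exclusive branch recovers each subgroup's inclusive total, which is what the shared-memory pass consumes.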
diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp new file mode 100644 index 000000000..111414f96 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp @@ -0,0 +1,61 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_HIPLIKE_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_HIPLIKE_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "scan_subgroup.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template <bool ExclusiveScan, typename OutType, typename BinaryOperation, typename MemoryType> +OutType wg_hiplike_scan(OutType x, BinaryOperation op, MemoryType shrd_mem, OutType init = 0) { + + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + const bool last_item_in_sg = (wg_lid % sg_size) == (sg_size - 1); + OutType sg_scan_result; + if constexpr (ExclusiveScan) { + sg_scan_result = sg_exclusive_scan(x, op, init); + } else { + sg_scan_result = sg_inclusive_scan(x, op); + } + + if (last_item_in_sg) { + if constexpr (ExclusiveScan) { + shrd_mem[subgroup_id] = op(sg_scan_result, x); + } else { + shrd_mem[subgroup_id] = sg_scan_result; + } + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + if (subgroup_id == 0) { + shrd_mem[wg_lid] = sg_inclusive_scan(shrd_mem[wg_lid], op); + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + return subgroup_id > 0 ? op(shrd_mem[subgroup_id - 1], sg_scan_result) : sg_scan_result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp new file mode 100644 index 000000000..18dff327c --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp @@ -0,0 +1,75 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details.
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_HOST_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_HOST_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "scan_subgroup.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template +OutType wg_host_scan(OutType x, BinaryOperation op, MemoryType shrd_mem, OutType init = 0) { + const __acpp_uint32 wg_lid = __acpp_sscp_typed_get_local_linear_id<3, int>(); + const __acpp_uint32 wg_size = __acpp_sscp_typed_get_local_size<3, int>(); + const __acpp_uint32 max_sg_size = __acpp_sscp_get_subgroup_max_size(); + const __acpp_int32 sg_size = __acpp_sscp_get_subgroup_size(); + + const __acpp_uint32 num_subgroups = (wg_size + max_sg_size - 1) / max_sg_size; + const __acpp_uint32 subgroup_id = wg_lid / max_sg_size; + + const bool last_item_in_sg = (wg_lid % sg_size) == (sg_size - 1); + OutType local_x; + if constexpr (ExclusiveScan) { + if (wg_lid + 1 < wg_size) { + shrd_mem[wg_lid + 1] = x; + } else { + shrd_mem[0] = init; + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + local_x = shrd_mem[wg_lid]; + } else { + shrd_mem[wg_lid] = x; + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + local_x = x; + } + + OutType other_x; + // TODO: Here we can just call the host inclusive scan + for (__acpp_int32 i = 1; i < wg_size; i *= 2) { + __acpp_int32 next_id = wg_lid - i; + bool is_nextid_valid = (next_id >= 0) && (i <= wg_lid); + + if (is_nextid_valid) { + other_x = shrd_mem[next_id]; + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + + if (is_nextid_valid) { + local_x = op(local_x, other_x); + shrd_mem[wg_lid] = local_x; + } + __acpp_sscp_work_group_barrier(__acpp_sscp_memory_scope::work_group, + __acpp_sscp_memory_order::relaxed); + } + return local_x; +} +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp new file mode 100644 index 000000000..5f79c49ed --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp @@ -0,0 +1,53 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#ifndef HIPSYCL_SSCP_DETAIL_SUBGROUP_SCAN_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_SUBGROUP_SCAN_BUILTINS_HPP + +#include "../core_typed.hpp" +#include "../subgroup.hpp" +#include "broadcast.hpp" +#include "shuffle.hpp" +#include "utils.hpp" + +namespace hipsycl::libkernel::sscp { + +template +T sg_inclusive_scan(T x, BinaryOperation binary_op) { + const __acpp_uint32 lid = __acpp_sscp_get_subgroup_local_id(); + const __acpp_uint32 lrange = __acpp_sscp_get_subgroup_max_size(); + const __acpp_uint64 subgroup_size = __acpp_sscp_get_subgroup_size(); + auto local_x = x; + for (__acpp_int32 i = 1; i < lrange; i *= 2) { + __acpp_uint32 next_id = lid - i; + auto other_x = __builtin_bit_cast( + T, sg_shift_right(__builtin_bit_cast(typename integer_type::type, local_x), i)); + if (next_id >= 0 && i <= lid) + local_x = binary_op(local_x, other_x); + } + return local_x; +} + +template +T sg_exclusive_scan(T x, BinaryOperation binary_op, T init) { + const __acpp_uint32 lid = __acpp_sscp_get_subgroup_local_id(); + const __acpp_uint64 subgroup_size = __acpp_sscp_get_subgroup_max_size(); + x = lid == 0 ? binary_op(x, init) : x; + auto result_inclusive = sg_inclusive_scan(x, binary_op); + auto result = __builtin_bit_cast( + T, sg_shift_right(__builtin_bit_cast(typename integer_type::type, result_inclusive), 1)); + result = lid % subgroup_size == 0 ? init : result; + return result; +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp new file mode 100644 index 000000000..ee0a2945c --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp @@ -0,0 +1,75 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_DETAIL_SHUFFLE_BUILTINS_HPP +#define HIPSYCL_SSCP_DETAIL_SHUFFLE_BUILTINS_HPP + +#include "../builtin_config.hpp" +#include "../shuffle.hpp" + +namespace hipsycl::libkernel::sscp { + +template <typename T> inline T sg_select(T, __acpp_int32) = delete; + +template <> inline __acpp_int8 sg_select<__acpp_int8>(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i8(value, id); +} + +template <> inline __acpp_int16 sg_select<__acpp_int16>(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i16(value, id); +} + +template <> inline __acpp_int32 sg_select<__acpp_int32>(__acpp_int32 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +template <> inline __acpp_int64 sg_select<__acpp_int64>(__acpp_int64 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i64(value, id); +} + +template <typename T> T inline sg_shift_left(T, __acpp_int32) = delete; + +template <> __acpp_int8 inline sg_shift_left<__acpp_int8>(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i8(value, id); +} + +template <> __acpp_int16 inline sg_shift_left<__acpp_int16>(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i16(value, id); +} + +template <> __acpp_int32 inline sg_shift_left<__acpp_int32>(__acpp_int32 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i32(value, id); +} + +template <> __acpp_int64 inline sg_shift_left<__acpp_int64>(__acpp_int64 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shl_i64(value, id); +} + +template <typename T> T inline sg_shift_right(T, __acpp_int32) = delete; + +template <> __acpp_int8 inline sg_shift_right<__acpp_int8>(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shr_i8(value, id); +} + +template <> __acpp_int16 inline sg_shift_right<__acpp_int16>(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shr_i16(value, id); +} + +template <> __acpp_int32 inline sg_shift_right<__acpp_int32>(__acpp_int32 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shr_i32(value, id); +} + +template <> __acpp_int64 inline sg_shift_right<__acpp_int64>(__acpp_int64 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_shr_i64(value, id); +} + +} // namespace hipsycl::libkernel::sscp + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp new file mode 100644 index 000000000..d90c195c6 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/detail/utils.hpp @@ -0,0 +1,90 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_UTILS_BUILTINS_HPP +#define HIPSYCL_SSCP_UTILS_BUILTINS_HPP + +#include "../builtin_config.hpp" + +#define ACPP_SHMEM_ATTRIBUTE \ + static __attribute__((loader_uninitialized)) __attribute__((address_space(3))) + +namespace hipsycl::libkernel::sscp { + +struct plus { + template <typename T> T operator()(T lhs, T rhs) { return lhs + rhs; } +}; + +struct min { + template <typename T> T operator()(T lhs, T rhs) { return lhs < rhs ? lhs : rhs; } +}; + +struct max { + template <typename T> T operator()(T lhs, T rhs) { return lhs < rhs ? rhs : lhs; } +}; + +struct multiply { + template <typename T> T operator()(T lhs, T rhs) { return lhs * rhs; } +}; + +struct bit_and { + template <typename T> T operator()(T lhs, T rhs) { return lhs & rhs; } +}; + +struct bit_or { + template <typename T> T operator()(T lhs, T rhs) { return lhs | rhs; } +}; + +struct bit_xor { + template <typename T> T operator()(T lhs, T rhs) { return lhs ^ rhs; } +}; + +struct logical_and { + template <typename T> T operator()(T lhs, T rhs) { return lhs and rhs; } +}; + +struct logical_or { + template <typename T> T operator()(T lhs, T rhs) { return lhs or rhs; } +}; + +template <__acpp_sscp_algorithm_op op> struct get_op {}; + +#define MAP_SSCP_ALGORITHM_OP(sscp_algo_op, impl) \ + template <> struct get_op<sscp_algo_op> { using type = impl; }; + +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::plus, plus) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::multiply, multiply) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::min, min) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::max, max) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::bit_and, bit_and) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::bit_or, bit_or) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::bit_xor, bit_xor) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::logical_and, logical_and) +MAP_SSCP_ALGORITHM_OP(__acpp_sscp_algorithm_op::logical_or, logical_or) + +#undef MAP_SSCP_ALGORITHM_OP + +template <typename T> struct integer_type { using type = T; }; + +template <> struct integer_type<__acpp_f32> { using type = __acpp_int32; }; + +template <> struct integer_type<__acpp_f64> { using type = __acpp_int64; }; + +template <> struct integer_type<__acpp_uint8> { using type = __acpp_int8; }; + +template <> struct integer_type<__acpp_uint16> { using type = __acpp_int16; }; + +template <> struct integer_type<__acpp_uint32> { using type = __acpp_int32; }; + +template <> struct integer_type<__acpp_uint64> { using type = __acpp_int64; }; + +} // namespace hipsycl::libkernel::sscp + +#endif
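get_op is the bridge between the __acpp_sscp_algorithm_op enum in the builtin ABI and the functors above; builtins resolve it at compile time, as in this minimal sketch (illustrative, mirroring the use in sg_reduce):

    using op_t = typename get_op<__acpp_sscp_algorithm_op::plus>::type;
    __acpp_int32 r = op_t{}(2, 3); // r == 5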
diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp index 1f1658262..c3653402e 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp @@ -8,12 +8,10 @@ * See file LICENSE in the project root for full license details.
*/ // SPDX-License-Identifier: BSD-2-Clause -#include "builtin_config.hpp" -#include "hipSYCL/sycl/libkernel/detail/half_representation.hpp" - #ifndef HIPSYCL_SSCP_REDUCTION_BUILTINS_HPP #define HIPSYCL_SSCP_REDUCTION_BUILTINS_HPP +#include "builtin_config.hpp" HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); @@ -48,8 +46,6 @@ __acpp_f32 __acpp_sscp_work_group_reduce_f32(__acpp_sscp_algorithm_op op, __acpp HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_f64 __acpp_sscp_work_group_reduce_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); - - HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); @@ -83,5 +79,4 @@ __acpp_f32 __acpp_sscp_sub_group_reduce_f32(__acpp_sscp_algorithm_op op, __acpp_ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_f64 __acpp_sscp_sub_group_reduce_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); - #endif diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp new file mode 100644 index 000000000..1f74221f8 --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp @@ -0,0 +1,104 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_SCAN_EXCLUSIVE_BUILTINS_HPP +#define HIPSYCL_SSCP_SCAN_EXCLUSIVE_BUILTINS_HPP + +#include "builtin_config.hpp" + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_work_group_exclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x, + __acpp_int8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_work_group_exclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x, + __acpp_int16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_work_group_exclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x, + __acpp_int32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_work_group_exclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x, + __acpp_int64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_work_group_exclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x, + __acpp_uint8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_work_group_exclusive_scan_u16(__acpp_sscp_algorithm_op op, + __acpp_uint16 x, __acpp_uint16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_work_group_exclusive_scan_u32(__acpp_sscp_algorithm_op op, + __acpp_uint32 x, __acpp_uint32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_work_group_exclusive_scan_u64(__acpp_sscp_algorithm_op op, + __acpp_uint64 x, __acpp_uint64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_work_group_exclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x, + __acpp_f16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_work_group_exclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x, + __acpp_f32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_work_group_exclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x, + __acpp_f64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 
__acpp_sscp_sub_group_exclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x, + __acpp_int8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_exclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x, + __acpp_int16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_exclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x, + __acpp_int32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_exclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x, + __acpp_int64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_sub_group_exclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x, + __acpp_uint8 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_sub_group_exclusive_scan_u16(__acpp_sscp_algorithm_op op, __acpp_uint16 x, + __acpp_uint16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_sub_group_exclusive_scan_u32(__acpp_sscp_algorithm_op op, __acpp_uint32 x, + __acpp_uint32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_sub_group_exclusive_scan_u64(__acpp_sscp_algorithm_op op, __acpp_uint64 x, + __acpp_uint64 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_sub_group_exclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x, + __acpp_f16 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_sub_group_exclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x, + __acpp_f32 init); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_sub_group_exclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x, + __acpp_f64 init); + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp new file mode 100644 index 000000000..9f443fead --- /dev/null +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp @@ -0,0 +1,88 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#ifndef HIPSYCL_SSCP_SCAN_INCLUSIVE_BUILTINS_HPP +#define HIPSYCL_SSCP_SCAN_INCLUSIVE_BUILTINS_HPP + +#include "builtin_config.hpp" + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_work_group_inclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_work_group_inclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_work_group_inclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_work_group_inclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_work_group_inclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_work_group_inclusive_scan_u16(__acpp_sscp_algorithm_op op, + __acpp_uint16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_work_group_inclusive_scan_u32(__acpp_sscp_algorithm_op op, + __acpp_uint32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_work_group_inclusive_scan_u64(__acpp_sscp_algorithm_op op, + __acpp_uint64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_work_group_inclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_work_group_inclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_work_group_inclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_inclusive_scan_i8(__acpp_sscp_algorithm_op op, __acpp_int8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_inclusive_scan_i16(__acpp_sscp_algorithm_op op, __acpp_int16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_inclusive_scan_i32(__acpp_sscp_algorithm_op op, __acpp_int32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_inclusive_scan_i64(__acpp_sscp_algorithm_op op, __acpp_int64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint8 __acpp_sscp_sub_group_inclusive_scan_u8(__acpp_sscp_algorithm_op op, __acpp_uint8 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint16 __acpp_sscp_sub_group_inclusive_scan_u16(__acpp_sscp_algorithm_op op, + __acpp_uint16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint32 __acpp_sscp_sub_group_inclusive_scan_u32(__acpp_sscp_algorithm_op op, + __acpp_uint32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_uint64 __acpp_sscp_sub_group_inclusive_scan_u64(__acpp_sscp_algorithm_op op, + __acpp_uint64 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f16 __acpp_sscp_sub_group_inclusive_scan_f16(__acpp_sscp_algorithm_op op, __acpp_f16 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f32 __acpp_sscp_sub_group_inclusive_scan_f32(__acpp_sscp_algorithm_op op, __acpp_f32 x); + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_f64 __acpp_sscp_sub_group_inclusive_scan_f64(__acpp_sscp_algorithm_op op, __acpp_f64 x); + +#endif \ No newline at end of file diff --git a/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp b/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp index 66deaa2db..d343a09df 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp @@ -8,11 +8,11 @@ * See file LICENSE in the project 
root for full license details. */ // SPDX-License-Identifier: BSD-2-Clause -#include "builtin_config.hpp" - #ifndef HIPSYCL_SSCP_SHUFFLE_BUILTINS_HPP #define HIPSYCL_SSCP_SHUFFLE_BUILTINS_HPP +#include "builtin_config.hpp" + HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta); @@ -127,7 +127,6 @@ HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id); - HIPSYCL_SSCP_CONVERGENT_BUILTIN __acpp_int8 __acpp_sscp_work_group_select_i8(__acpp_int8 value, __acpp_int32 id); diff --git a/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp b/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp index 6c33126f7..57dedb02a 100644 --- a/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp +++ b/include/hipSYCL/sycl/libkernel/sscp/group_functions.hpp @@ -34,6 +34,8 @@ #include "builtins/broadcast.hpp" #include "builtins/collpredicate.hpp" #include "builtins/reduction.hpp" +#include "builtins/scan_exclusive.hpp" +#include "builtins/scan_inclusive.hpp" #include "builtins/shuffle.hpp" namespace hipsycl { @@ -68,14 +70,11 @@ __acpp_group_barrier(sub_group g, // broadcast template -HIPSYCL_BUILTIN -T __acpp_group_broadcast( - group g, T x, - typename group::linear_id_type local_linear_id = 0) { +HIPSYCL_BUILTIN std::enable_if_t +__acpp_group_broadcast(group g, T x, typename group::linear_id_type local_linear_id = 0) { - if constexpr(sizeof(T) == 1) { - return maybe_bit_cast(__acpp_sscp_work_group_broadcast_i8( + return maybe_bit_cast(__acpp_sscp_work_group_broadcast_i8( static_cast<__acpp_int32>(local_linear_id), maybe_bit_cast<__acpp_int8>(x))); } else if constexpr(sizeof(T) == 2) { @@ -93,18 +92,9 @@ T __acpp_group_broadcast( } } -template -HIPSYCL_BUILTIN T __acpp_group_broadcast( - group g, T x, typename group::id_type local_id) { - - const auto sender_lid = linear_id::get( - local_id, g.get_local_range()); - return __acpp_group_broadcast(g, x, static_cast(sender_lid)); -} - template -HIPSYCL_BUILTIN T __acpp_group_broadcast( - sub_group g, T x, typename sub_group::linear_id_type local_linear_id = 0) { +HIPSYCL_BUILTIN std::enable_if_t +__acpp_group_broadcast(sub_group g, T x, typename sub_group::linear_id_type local_linear_id = 0) { // Song recommendation: Leaves' Eyes - Angel and the Ghost // @@ -138,11 +128,12 @@ T __acpp_group_broadcast(sub_group g, T x, return __acpp_group_broadcast(g, x, static_cast(local_id[0])); } -template + +template HIPSYCL_BUILTIN -std::enable_if_t<(sizeof(vec) > 8), vec> +std::enable_if_t<(sizeof(vec) > 8), vec> __acpp_group_broadcast( - Group g, vec x, + Group g, vec x, typename Group::linear_id_type local_linear_id = 0) { vec result; for (int i = 0; i < N; ++i) { @@ -151,11 +142,11 @@ __acpp_group_broadcast( return result; } -template +template HIPSYCL_BUILTIN -std::enable_if_t<(sizeof(marray) > 8), marray> +std::enable_if_t<(sizeof(marray) > 8), marray> __acpp_group_broadcast( - Group g, marray x, + Group g, marray x, typename Group::linear_id_type local_linear_id = 0) { marray result; for (int i = 0; i < N; ++i) { @@ -164,6 +155,14 @@ __acpp_group_broadcast( return result; } +template +HIPSYCL_BUILTIN T __acpp_group_broadcast( + group g, T x, typename group::id_type local_id) { + + const auto sender_lid = linear_id::get( + local_id, g.get_local_range()); + return __acpp_group_broadcast(g, x, static_cast(sender_lid)); +} // any_of @@ -334,10 +333,10 @@ HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::bit_or, T{0}) 
HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::bit_xor, T{0}) HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::logical_and, T{1}) HIPSYCL_SSCP_MAP_GROUP_BINARY_IDENTITY(__acpp_sscp_algorithm_op::logical_or, T{0}) - +// ---- subgroup template < typename T, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && std::is_signed_v), int> = 0> + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> HIPSYCL_BUILTIN T __acpp_reduce_over_group(sub_group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { @@ -361,7 +360,7 @@ HIPSYCL_BUILTIN T __acpp_reduce_over_group(sub_group g, T x, template < typename T, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && !std::is_signed_v), int> = 0> + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> HIPSYCL_BUILTIN T __acpp_reduce_over_group(sub_group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { @@ -424,12 +423,12 @@ marray __acpp_reduce_over_group(sub_group g, marray x, BinaryOperation return result; } +// End of subgroup algos template < typename T, int Dim, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && std::is_signed_v), int> = 0> -HIPSYCL_BUILTIN -T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_work_group_reduce_i8( sscp_binary_operation_v, @@ -451,7 +450,7 @@ T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { template < typename T, int Dim, typename BinaryOperation, - std::enable_if_t<(std::is_integral_v && !std::is_signed_v), int> = 0> + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> HIPSYCL_BUILTIN T __acpp_reduce_over_group(group g, T x, BinaryOperation binary_op) { if constexpr(sizeof(T) == 1) { @@ -537,8 +536,7 @@ __acpp_joint_reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) { using type = decltype(*first); - auto local = sscp_binary_operation_identity< - std::decay_t, sscp_binary_operation_v>::get(); + auto local = sscp_binary_operation_identity, sscp_binary_operation_v>::get(); if(start_ptr < last) local = *start_ptr; @@ -556,74 +554,442 @@ T __acpp_joint_reduce(Group g, Ptr first, Ptr last, T init, return binary_op(__acpp_joint_reduce(g, first, last, binary_op), init); } -// exclusive_scan +// subgroup inclusive_scan + +template < + typename T, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x))); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_i64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x))); + } +} + +template < + typename T, typename 
BinaryOperation, + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x))); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_inclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x))); + } +} + +template +HIPSYCL_BUILTIN half __acpp_inclusive_scan_over_group(sub_group g, half x, + BinaryOperation binary_op) { + return detail::create_half(__acpp_sscp_sub_group_inclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x))); +} + +template +HIPSYCL_BUILTIN float __acpp_inclusive_scan_over_group(sub_group g, float x, + BinaryOperation binary_op) { + return __acpp_sscp_sub_group_inclusive_scan_f32(sscp_binary_operation_v, x); +} -template -HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group( - group g, V x, T init, BinaryOperation binary_op); +template +HIPSYCL_BUILTIN double __acpp_inclusive_scan_over_group(sub_group g, double x, + BinaryOperation binary_op) { + return __acpp_sscp_sub_group_inclusive_scan_f64(sscp_binary_operation_v, x); +} +// group inclusive scan -template -HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group( - sub_group g, V x, T init, BinaryOperation binary_op); +template < + typename T, int Dim, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x))); + } else { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_i64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x))); + } +} -template >, bool> = true> -HIPSYCL_BUILTIN T -__acpp_exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op); +template < + typename T, int Dim, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(group g, T x, BinaryOperation binary_op) { + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x))); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint16>(x))); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x))); + } else { + return 
maybe_bit_cast(__acpp_sscp_work_group_inclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x))); + } +} -template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - T init, BinaryOperation binary_op); +template +HIPSYCL_BUILTIN half __acpp_inclusive_scan_over_group(group g, half x, + BinaryOperation binary_op) { + return detail::create_half(__acpp_sscp_work_group_inclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x))); +} -template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op); +template +HIPSYCL_BUILTIN float __acpp_inclusive_scan_over_group(group g, float x, + BinaryOperation binary_op) { + return __acpp_sscp_work_group_inclusive_scan_f32(sscp_binary_operation_v, x); +} + +template +HIPSYCL_BUILTIN double __acpp_inclusive_scan_over_group(group g, double x, + BinaryOperation binary_op) { + return __acpp_sscp_work_group_inclusive_scan_f64(sscp_binary_operation_v, x); +} -// inclusive_scan +template +HIPSYCL_BUILTIN vec __acpp_inclusive_scan_over_group(Group g, vec x, + BinaryOperation binary_op) { + vec result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_inclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + return result; +} -template +HIPSYCL_BUILTIN marray __acpp_inclusive_scan_over_group(Group g, marray x, + BinaryOperation binary_op) { + marray result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_inclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + return result; +} + +template +HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group(Group g, V x, T init, + BinaryOperation binary_op) { + const size_t lid = g.get_local_linear_id(); + x = lid == 0 ? binary_op(init, x) : x; + __acpp_group_barrier(g); + x = __acpp_inclusive_scan_over_group(g, x, binary_op); + __acpp_group_barrier(g); + return x; +} + +template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op, T init); +HIPSYCL_BUILTIN OutPtr __acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + const size_t lrange = g.get_local_range().size(); + const size_t num_elements = last - first; + const size_t lid = g.get_local_linear_id(); + + if (num_elements == 0) + return result; + + if (num_elements == 1) { + *result = *first; + return result; + } + + using type = decltype(*first); + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + size_t num_segments = (num_elements + lrange - 1) / lrange; + + for (size_t segment = 0; segment < num_segments; segment++) { + size_t element_idx = segment * lrange + lid; + auto local_element = element_idx < num_elements ? 
first[element_idx] : identity; + auto segment_result = __acpp_inclusive_scan_over_group(g, local_element, binary_op); + if (element_idx < num_elements) { + result[element_idx] = segment_result; + } + __acpp_group_barrier(g); + + if (segment > 0) { + auto update_value = result[segment * lrange - 1]; + if (element_idx < num_elements) { + result[element_idx] = binary_op(update_value, result[element_idx]); + } + } + __acpp_group_barrier(g); + } + return result; +} -template >, bool> = true> -HIPSYCL_BUILTIN OutPtr -__acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, - BinaryOperation binary_op); +HIPSYCL_BUILTIN OutPtr __acpp_joint_inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op, T init) { -template -HIPSYCL_BUILTIN -T __acpp_inclusive_scan_over_group( - group g, T x, BinaryOperation binary_op); + const size_t lrange = g.get_local_range().size(); + const size_t num_elements = last - first; + const size_t lid = g.get_local_linear_id(); + + if (lid == 0 && num_elements > 0) { + first[0] = binary_op(first[0], init); + } + __acpp_group_barrier(g); + OutPtr updated_result = __acpp_joint_inclusive_scan(g, first, last, result, binary_op); + __acpp_group_barrier(g); + return updated_result; +} + +// exclusive_scan -- subgroup + +template < + typename T, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x), identity)); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x), identity)); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x), identity)); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_i64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x), identity)); + } +} + +template < + typename T, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(sub_group g, T x, BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x), identity)); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint16>(x), identity)); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x), identity)); + } else { + return maybe_bit_cast(__acpp_sscp_sub_group_exclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x), identity)); + } +} + +template +HIPSYCL_BUILTIN half __acpp_exclusive_scan_over_group(sub_group g, half x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return 
detail::create_half(__acpp_sscp_sub_group_exclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x), identity)); +} + +template +HIPSYCL_BUILTIN float __acpp_exclusive_scan_over_group(sub_group g, float x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return __acpp_sscp_sub_group_exclusive_scan_f32(sscp_binary_operation_v, x, + identity); +} + +template +HIPSYCL_BUILTIN double __acpp_exclusive_scan_over_group(sub_group g, double x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return __acpp_sscp_sub_group_exclusive_scan_f64(sscp_binary_operation_v, x, + identity); +} + +// exclusive scan group + +template < + typename T, int Dim, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(group g, T x, BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int8>(x), identity)); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int16>(x), identity)); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int32>(x), identity)); + } else { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_i64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_int64>(x), identity)); + } +} + +template < + typename T, int Dim, typename BinaryOperation, + std::enable_if_t<(std::is_integral_v && !std::is_signed_v && sizeof(T) <= 8), int> = 0> +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(group g, T x, BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if constexpr (sizeof(T) == 1) { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u8( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint8>(x), identity)); + } else if constexpr (sizeof(T) == 2) { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u16( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint16>(x), identity)); + } else if constexpr (sizeof(T) == 4) { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u32( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint32>(x), identity)); + } else { + return maybe_bit_cast(__acpp_sscp_work_group_exclusive_scan_u64( + sscp_binary_operation_v, maybe_bit_cast<__acpp_uint64>(x), identity)); + } +} + +template +HIPSYCL_BUILTIN half __acpp_exclusive_scan_over_group(group g, half x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return detail::create_half(__acpp_sscp_work_group_exclusive_scan_f16( + sscp_binary_operation_v, detail::get_half_storage(x), identity)); +} + +template +HIPSYCL_BUILTIN float __acpp_exclusive_scan_over_group(group g, float x, + BinaryOperation binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return __acpp_sscp_work_group_exclusive_scan_f32(sscp_binary_operation_v, x, + identity); +} + +template +HIPSYCL_BUILTIN double __acpp_exclusive_scan_over_group(group g, double x, + BinaryOperation 
binary_op) { + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + return __acpp_sscp_work_group_exclusive_scan_f64(sscp_binary_operation_v, x, + identity); +} + +template +HIPSYCL_BUILTIN vec __acpp_exclusive_scan_over_group(Group g, vec x, + BinaryOperation binary_op) { + vec result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_exclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + return result; +} + +template +HIPSYCL_BUILTIN marray __acpp_exclusive_scan_over_group(Group g, marray x, + BinaryOperation binary_op) { + marray result; + for (int i = 0; i < N; ++i) { + result[i] = __acpp_exclusive_scan_over_group(g, x[i], binary_op); + __acpp_group_barrier(g); + } + return result; +} -template -HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group( - sub_group g, T x, BinaryOperation binary_op); +template +HIPSYCL_BUILTIN T __acpp_exclusive_scan_over_group(Group g, V x, T init, + BinaryOperation binary_op) { + const size_t lid = g.get_local_linear_id(); + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + x = lid == 0 ? binary_op(init, x) : x; + __acpp_group_barrier(g); + x = __acpp_exclusive_scan_over_group(g, x, binary_op); + __acpp_group_barrier(g); + if (lid == 0) { + x = init; + } + return x; +} -template >, bool> = true> -HIPSYCL_BUILTIN T __acpp_inclusive_scan_over_group( - Group g, V x, T init, BinaryOperation binary_op) { - auto scan = __acpp_inclusive_scan_over_group(g, T{x}, binary_op); - return binary_op(scan, init); +HIPSYCL_BUILTIN OutPtr __acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + BinaryOperation binary_op) { + const size_t lid = g.get_local_linear_id(); + __acpp_joint_inclusive_scan(g, first, last - 1, result + 1, binary_op); + __acpp_group_barrier(g); + using type = decltype(*first); + auto identity = sscp_binary_operation_identity, + sscp_binary_operation_v>::get(); + if (lid == 0) { + result[0] = identity; + } + __acpp_group_barrier(g); + + return result; +} + +template >, bool> = true> +HIPSYCL_BUILTIN OutPtr __acpp_joint_exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, + T init, BinaryOperation binary_op) { + + const size_t lrange = g.get_local_range().size(); + const size_t num_elements = last - first; + const size_t lid = g.get_local_linear_id(); + __acpp_group_barrier(g); + if (lid == 0 && num_elements > 0) { + first[0] = binary_op(first[0], init); + result[0] = init; + } + __acpp_group_barrier(g); + OutPtr updated_result = __acpp_joint_inclusive_scan(g, first, last - 1, result + 1, binary_op); + __acpp_group_barrier(g); + return updated_result; } // shift_left template HIPSYCL_BUILTIN -T __acpp_shift_group_left( +std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_left( group g, T x, typename group::linear_id_type delta = 1) { if constexpr(sizeof(T) == 1) { @@ -642,7 +1008,7 @@ T __acpp_shift_group_left( } template -HIPSYCL_BUILTIN T __acpp_shift_group_left( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_left( sub_group g, T x, typename sub_group::linear_id_type delta = 1) { if constexpr(sizeof(T) == 1) { @@ -667,6 +1033,7 @@ __acpp_shift_group_left(Group g, vec x, typename Group::linear_id_type delt vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_shift_group_left(g, x[i], delta); + __acpp_group_barrier(g); } return result; } @@ -678,13 +1045,14 @@ __acpp_shift_group_left(Group g, marray x, typename Group::linear_id_type d marray result; for(int i = 0; i < N; ++i) { result[i] = 
__acpp_shift_group_left(g, x[i], delta); + __acpp_group_barrier(g); } return result; } // shift_right template -HIPSYCL_BUILTIN T __acpp_shift_group_right( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_right( group g, T x, typename group::linear_id_type delta = 1) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_work_group_shr_i8( @@ -702,7 +1070,7 @@ HIPSYCL_BUILTIN T __acpp_shift_group_right( } template -HIPSYCL_BUILTIN T __acpp_shift_group_right( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_shift_group_right( sub_group g, T x, typename sub_group::linear_id_type delta = 1) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_sub_group_shr_i8( @@ -727,6 +1095,7 @@ __acpp_shift_group_right(Group g, vec x, typename Group::linear_id_type del vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_shift_group_right(g, x[i], delta); + __acpp_group_barrier(g); } return result; } @@ -738,13 +1107,14 @@ __acpp_shift_group_right(Group g, marray x, typename Group::linear_id_type marray result; for(int i = 0; i < N; ++i) { result[i] = __acpp_shift_group_right(g, x[i], delta); + __acpp_group_barrier(g); } return result; } // permute_group_by_xor template -HIPSYCL_BUILTIN T __acpp_permute_group_by_xor( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_permute_group_by_xor( group g, T x, typename group::linear_id_type mask) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_work_group_permute_i8( @@ -762,7 +1132,7 @@ HIPSYCL_BUILTIN T __acpp_permute_group_by_xor( } template -HIPSYCL_BUILTIN T __acpp_permute_group_by_xor( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_permute_group_by_xor( sub_group g, T x, typename sub_group::linear_id_type mask) { if constexpr(sizeof(T) == 1) { return maybe_bit_cast(__acpp_sscp_sub_group_permute_i8( @@ -786,6 +1156,7 @@ __acpp_permute_group_by_xor(Group g, vec x, typename Group::linear_id_type vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_permute_group_by_xor(g, x[i], mask); + __acpp_group_barrier(g); } return result; } @@ -797,13 +1168,14 @@ __acpp_permute_group_by_xor(Group g, marray x, typename Group::linear_id_ty marray result; for(int i = 0; i < N; ++i) { result[i] = __acpp_permute_group_by_xor(g, x[i], mask); + __acpp_group_barrier(g); } return result; } // select_from_group template -HIPSYCL_BUILTIN T __acpp_select_from_group( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_select_from_group( group g, T x, typename group::id_type remote_local_id) { __acpp_int32 linear_id = static_cast<__acpp_int32>( @@ -825,7 +1197,7 @@ HIPSYCL_BUILTIN T __acpp_select_from_group( } template -HIPSYCL_BUILTIN T __acpp_select_from_group( +HIPSYCL_BUILTIN std::enable_if_t<(sizeof(T) <= 8), T> __acpp_select_from_group( sub_group g, T x, typename sub_group::id_type remote_local_id) { __acpp_int32 linear_id = static_cast<__acpp_int32>(remote_local_id[0]); @@ -853,6 +1225,7 @@ __acpp_select_from_group(Group g, vec x, typename Group::id_type remote_loc vec result; for(int i = 0; i < N; ++i) { result[i] = __acpp_select_from_group(g, x[i], remote_local_id); + __acpp_group_barrier(g); } return result; } @@ -864,6 +1237,7 @@ __acpp_select_from_group(Group g, marray x, typename Group::id_type remote_ marray result; for(int i = 0; i < N; ++i) { result[i] = __acpp_select_from_group(g, x[i], remote_local_id); + __acpp_group_barrier(g); } return result; } diff --git a/include/hipSYCL/sycl/platform.hpp b/include/hipSYCL/sycl/platform.hpp index 
934b2b8fb..4a61302a1 100644 --- a/include/hipSYCL/sycl/platform.hpp +++ b/include/hipSYCL/sycl/platform.hpp @@ -12,6 +12,7 @@ #define HIPSYCL_PLATFORM_HPP #include +#include #include "hipSYCL/runtime/application.hpp" #include "hipSYCL/runtime/backend.hpp" @@ -22,6 +23,7 @@ #include "info/info.hpp" #include "version.hpp" + namespace hipsycl { namespace sycl { @@ -32,13 +34,25 @@ class platform { public: platform() : _platform{detail::get_host_device().get_backend(), 0} {} - platform(rt::backend_id backend) - : _platform{backend, 0} {} + platform(rt::platform_id platform) + : _platform{platform} {} + + platform(rt::backend_id backend, std::size_t platform_index) + : _platform{backend, platform_index} {} template explicit platform(const DeviceSelector &deviceSelector) { auto dev = detail::select_devices(deviceSelector)[0]; - this->_platform = rt::platform_id{dev._device_id}; + + rt::backend *b = + _requires_runtime.get()->backends().get(dev.get_backend()); + std::size_t platform_index = + b->get_hardware_manager() + ->get_device(dev.AdaptiveCpp_device_id().get_id()) + ->get_platform_index(); + + this->_platform = + rt::platform_id{dev.get_backend(), static_cast(platform_index)}; } @@ -54,12 +68,15 @@ class platform { bool is_gpu = b->get_hardware_manager()->get_device(dev)->is_gpu(); bool include_device = false; - if (type == info::device_type::all || - (type == info::device_type::accelerator && is_gpu) || - (type == info::device_type::gpu && is_gpu) || - (type == info::device_type::host && is_cpu) || - (type == info::device_type::cpu && is_cpu)) { - include_device = true; + if (b->get_hardware_manager()->get_device(dev)->get_platform_index() == + _platform.get_platform()) { + if (type == info::device_type::all || + (type == info::device_type::accelerator && is_gpu) || + (type == info::device_type::gpu && is_gpu) || + (type == info::device_type::host && is_cpu) || + (type == info::device_type::cpu && is_cpu)) { + include_device = true; + } } if (include_device) @@ -102,7 +119,10 @@ class platform { rt::runtime_keep_alive_token requires_runtime; requires_runtime.get()->backends().for_each_backend([&](rt::backend *b) { - result.push_back(platform{b->get_unique_backend_id()}); + for (std::size_t i = 0; + i < b->get_hardware_manager()->get_num_platforms(); ++i) { + result.push_back(platform{b->get_unique_backend_id(), i}); + } }); return result; @@ -126,7 +146,8 @@ class platform { return AdaptiveCpp_hash_code(); } - + + context khr_get_default_context() const; private: rt::platform_id _platform; rt::runtime_keep_alive_token _requires_runtime; @@ -147,12 +168,16 @@ HIPSYCL_SPECIALIZE_GET_INFO(platform, version) HIPSYCL_SPECIALIZE_GET_INFO(platform, name) { rt::backend_id b = _platform.get_backend(); - return _requires_runtime.get()->backends().get(b)->get_name(); + std::string platform_name = + _requires_runtime.get()->backends().get(b)->get_name(); + platform_name += + " (platform " + std::to_string(_platform.get_platform()) + ")"; + return platform_name; } HIPSYCL_SPECIALIZE_GET_INFO(platform, vendor) { - return "The hipSYCL project"; + return "The AdaptiveCpp project"; } HIPSYCL_SPECIALIZE_GET_INFO(platform, extensions) @@ -161,7 +186,8 @@ HIPSYCL_SPECIALIZE_GET_INFO(platform, extensions) } inline platform device::get_platform() const { - return platform{_device_id.get_backend()}; + return platform{_device_id.get_backend(), + static_cast(get_rt_device()->get_platform_index())}; } }// namespace sycl diff --git a/include/hipSYCL/sycl/queue.hpp b/include/hipSYCL/sycl/queue.hpp index 
4759c73b4..477978a2b 100644 --- a/include/hipSYCL/sycl/queue.hpp +++ b/include/hipSYCL/sycl/queue.hpp @@ -153,6 +153,8 @@ class queue : public detail::property_carrying_object // Prevents kernel cache from becoming invalid while we have a queue std::shared_ptr kernel_cache; + // For non-emulated in-order queues only + std::atomic has_non_instant_operations = false; }; template @@ -188,11 +190,12 @@ class queue : public detail::property_carrying_object : queue{detail::select_devices(deviceSelector), asyncHandler, propList} {} explicit queue(const device &syclDevice, const property_list &propList = {}) - : queue{context{syclDevice}, std::vector{syclDevice}, propList} {} + : queue{get_default_context(syclDevice), std::vector{syclDevice}, + propList} {} explicit queue(const device &syclDevice, const async_handler &asyncHandler, const property_list &propList = {}) - : queue{context{syclDevice, asyncHandler}, std::vector{syclDevice}, + : queue{get_default_context(syclDevice), std::vector{syclDevice}, asyncHandler, propList} {} template < @@ -229,10 +232,10 @@ class queue : public detail::property_carrying_object explicit queue(const std::vector &devices, const async_handler &handler, const property_list &propList = {}) - : queue{context{devices, handler}, devices, handler, propList} {} + : queue{get_default_context(devices), devices, handler, propList} {} explicit queue(const std::vector& devices, const property_list& propList = {}) - : queue{context{devices}, devices, propList} {} + : queue{get_default_context(devices), devices, propList} {} explicit queue(const context &syclContext, const std::vector &devices, const property_list &propList = {}) @@ -355,7 +358,8 @@ class queue : public detail::property_carrying_object assert(exec); // Need to ensure everything is submitted before waiting on the stream // in case we have non-instant operations - _impl->requires_runtime.get()->dag().flush_sync(); + if(_impl->has_non_instant_operations.load(std::memory_order_relaxed)) + _impl->requires_runtime.get()->dag().flush_sync(); auto err = exec->wait(); if(!err.is_success()) { @@ -1020,6 +1024,20 @@ class queue : public detail::property_carrying_object return AdaptiveCpp_inorder_executor(); } private: + static context get_default_context(const device& dev) { + return context{detail::default_context_tag_t{}, dev.get_platform()}; + } + + static context get_default_context(const std::vector &devices) { + if(devices.empty()) + return context{detail::default_context_tag_t{}}; + if(devices.size() == 1){ + return context{detail::default_context_tag_t{}, devices[0].get_platform()}; + } else { + return context{detail::default_context_tag_t{}, devices}; + } + } + template void apply_preferred_group_size(const property_list& prop_list, handler& cgh) { if(prop_list.has_property>()){ @@ -1047,6 +1065,7 @@ class queue : public detail::property_carrying_object if(_impl->needs_in_order_emulation) { _impl->previous_submission = node; } else if(cgh.contains_non_instant_nodes()) { + _impl->has_non_instant_operations.store(true, std::memory_order_relaxed); // If we have instant submission enabled, non-emulated in-order queue // but non-instant tasks, we need to flush the dag, otherwise future instant // tasks might not wait on the tasks that have been cached in the dag diff --git a/include/hipSYCL/sycl/sycl.hpp b/include/hipSYCL/sycl/sycl.hpp index 10a95c28a..31f269349 100644 --- a/include/hipSYCL/sycl/sycl.hpp +++ b/include/hipSYCL/sycl/sycl.hpp @@ -23,8 +23,8 @@ #undef SYCL_LANGUAGE_VERSION #endif -#define 
CL_SYCL_LANGUAGE_VERSION 202003 -#define SYCL_LANGUAGE_VERSION 202003 +#define CL_SYCL_LANGUAGE_VERSION 202012L +#define SYCL_LANGUAGE_VERSION 202012L #define SYCL_FEATURE_SET_FULL #include "hipSYCL/glue/persistent_runtime.hpp" @@ -64,6 +64,7 @@ #include "version.hpp" #include "types.hpp" #include "exception.hpp" +#include "is_device_copyable.hpp" #include "device_selector.hpp" #include "device.hpp" #include "platform.hpp" @@ -78,6 +79,7 @@ #include "buffer_explicit_behavior.hpp" #include "specialized.hpp" #include "jit.hpp" +#include "detail/namespace_compat.hpp" // Support SYCL_EXTERNAL for SSCP - we cannot have SYCL_EXTERNAL if accelerated CPU // is active at the same time :( diff --git a/include/hipSYCL/sycl/usm.hpp b/include/hipSYCL/sycl/usm.hpp index 350869648..26bf67655 100644 --- a/include/hipSYCL/sycl/usm.hpp +++ b/include/hipSYCL/sycl/usm.hpp @@ -35,7 +35,8 @@ namespace sycl { inline void *malloc_device(size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_device_allocator(dev)->allocate(0, num_bytes); + return rt::allocate_device(detail::select_device_allocator(dev), 0, + num_bytes); } template @@ -55,7 +56,7 @@ T* malloc_device(std::size_t count, const queue &q) { inline void *aligned_alloc_device(std::size_t alignment, std::size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_device_allocator(dev)->allocate(alignment, num_bytes); + return rt::allocate_device(detail::select_device_allocator(dev), alignment, num_bytes); } template @@ -79,7 +80,7 @@ T *aligned_alloc_device(std::size_t alignment, std::size_t count, // Restricted USM inline void *malloc_host(std::size_t num_bytes, const context &ctx) { - return detail::select_usm_allocator(ctx)->allocate_optimized_host(0, num_bytes); + return rt::allocate_host(detail::select_usm_allocator(ctx), 0, num_bytes); } template T *malloc_host(std::size_t count, const context &ctx) { @@ -96,7 +97,7 @@ template T *malloc_host(std::size_t count, const queue &q) { inline void *malloc_shared(std::size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_usm_allocator(ctx, dev)->allocate_usm(num_bytes); + return rt::allocate_shared(detail::select_usm_allocator(ctx, dev), num_bytes); } template @@ -114,8 +115,8 @@ template T *malloc_shared(std::size_t count, const queue &q) { inline void *aligned_alloc_host(std::size_t alignment, std::size_t num_bytes, const context &ctx) { - return detail::select_usm_allocator(ctx)->allocate_optimized_host(alignment, - num_bytes); + return rt::allocate_host(detail::select_usm_allocator(ctx), alignment, + num_bytes); } template @@ -137,7 +138,7 @@ T *aligned_alloc_host(std::size_t alignment, std::size_t count, inline void *aligned_alloc_shared(std::size_t alignment, std::size_t num_bytes, const device &dev, const context &ctx) { - return detail::select_usm_allocator(ctx, dev)->allocate_usm(num_bytes); + return rt::allocate_shared(detail::select_usm_allocator(ctx, dev), num_bytes); } template @@ -224,7 +225,7 @@ T *aligned_alloc(std::size_t alignment, std::size_t count, const sycl::queue &q, } inline void free(void *ptr, const sycl::context &ctx) { - return detail::select_usm_allocator(ctx)->free(ptr); + return rt::deallocate(detail::select_usm_allocator(ctx), ptr); } inline void free(void *ptr, const sycl::queue &q) { diff --git a/include/hipSYCL/sycl/version.hpp b/include/hipSYCL/sycl/version.hpp index d329026c9..887055395 100644 --- a/include/hipSYCL/sycl/version.hpp +++ b/include/hipSYCL/sycl/version.hpp @@ -21,8 +21,9 @@ 
namespace detail { static std::string version_string() { + std::string zero = (ACPP_VERSION_MINOR < 10 ? "0" : ""); std::string version = std::to_string(ACPP_VERSION_MAJOR) - + "." + std::to_string(ACPP_VERSION_MINOR) + + "." + zero + std::to_string(ACPP_VERSION_MINOR) + "." + std::to_string(ACPP_VERSION_PATCH) + std::string(ACPP_VERSION_SUFFIX); diff --git a/install/scripts/README.md b/install/scripts/README.md deleted file mode 100644 index a1d86d4fd..000000000 --- a/install/scripts/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# hipSYCL installation and packaging scripts - -We provide -* Scripts to install hipSYCL and required LLVM, ROCm and CUDA stacks -* Repositories for all supported distributions -* Singularity definition files which allow to create singularity container images with hipSYCL -* Pre-built singularity containers -* Scripts to create binary packages of the entire stack for several distributions. - -Currently, we support -* Ubuntu 18.04 -* Ubuntu 20.04 -* CentOS 7 -* Arch Linux - -## Installing from repositories -Installing using the repositories is beneficial because the hipSYCL installation can be kept up to date with regular system updates. We provide stable packages subject to more rigorous testing and nightly packages built from the current development head. - -We provide the following packages in both versions: - -Base packages: -* `hipSYCL-base<-nightly>` -* `hipSYCL-base-rocm<-nightly>` -HipSYCL packages: -* `hipSYCL-omp<-nightly>` -* `hipSYCL-omp-cuda<-nightly>` -* `hipSYCL-omp-rocm<-nightly>` -* `hipSYCL-omp-rocm-cuda<-nightly>` -Two meta-packages in order to keep consistent with the previous packages: -* `hipSYCL-full<-nightly>` -> `hipSYCL-omp-rocm-cuda<-nightly>` -* `hipSYCL<-nightly>` -> `hipSYCL-omp-rocm-cuda<-nightly>` - -We require some additional software repos to be enabled (for example, `release-scl` and `epel` for centos 7 ). To make adding these easier, we provide scripts in the `install/scripts/add-hipsycl-repo` for all supported distributions that handles adding these repositories, as well as adding the hipSYCL repo. - -## Installing by script -Note that the installation scripts may require the installation of some packages, depending on your distributions. We recommend first looking at the singularity definition files `*.def` for your distribution and installing everything that is installed there. Afterwards, run - -* `sudo sh install-llvm.sh` - basic LLVM/clang stack required by hipSYCL -* `sudo sh install-cuda.sh` - downloads and installs a compatible CUDA distribution -* `sudo sh install-rocm.sh` - installs a compatible ROCm stack -* `sudo sh install-hipsycl.sh` - installs hipSYCL. - -Unless you have a massive machine, you can expect this to run for half an eternity, so patience is a prerequisite for this installation approach. The easier way is to use our provided binary packages. -The installation prefix can be changed using the environment variable `INSTALL_PREFIX` (default is `/opt/hipSYCL`). Note that the `install-hipsycl.sh` script builds hipSYCL with support for both CUDA and ROCm backends by default, which means you need to have both installed. If you wish to disable support for CUDA/ROCm, set the `HIPSYCL_WITH_CUDA` or `HIPSYCL_WITH_ROCM` environment variables to `OFF`. - -If you change the `INSTALL_PREFIX` to a directory that is writable by your user, `sudo` is not required. - -## Building a singularity container -We also provide singularity definition files in order to create singularity container images. 
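
Reviewer note on the version.hpp hunk above: the new `zero` prefix pads single-digit minor versions, so a hypothetical 24.6.0 renders as "24.06.0" while 24.10.0 stays unpadded, matching the zero-padded release tags. A minimal standalone sketch of that logic (the macro values below are assumed for illustration; the real ones are set by CMake, and the suffix handling is omitted):

```c++
#include <iostream>
#include <string>

// Assumed values for illustration; in the tree these come from CMake's
// ACPP_VERSION_* configuration.
#define ACPP_VERSION_MAJOR 24
#define ACPP_VERSION_MINOR 10
#define ACPP_VERSION_PATCH 0

int main() {
  // Pad single-digit minor versions with a leading zero.
  std::string zero = (ACPP_VERSION_MINOR < 10 ? "0" : "");
  std::string version = std::to_string(ACPP_VERSION_MAJOR) + "." + zero +
                        std::to_string(ACPP_VERSION_MINOR) + "." +
                        std::to_string(ACPP_VERSION_PATCH);
  std::cout << version << "\n"; // prints "24.10.0"
}
```
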
Building an image consists of building a writable base image and afterwards installing the dependencies and hipsycl into the container - -``` -singularity build --fakeroot --sandbox base-ubuntu-18.04.sif base-definitions/base-ubuntu-18.04.def -``` -for Ubuntu 18.04. Once this image is built, you can start adding the dependencies -``` -singularity exec hipsycl-ubuntu-18.04.def install-llvm.sh -singularity exec hipsycl-ubuntu-18.04.def install-rocm.sh -singularity exec hipsycl-ubuntu-18.04.def install-cuda.sh -``` -Note that there are two type of installation scripts available at the moment the regular ones located in the `install/scripts/` directory, and scripts that use spack to install the dependencies located in `install/scripts/spack-install/`. The spack install scripts are well tested, therefore we recommend using those for the installation. The regular install scripts might need some changes to work flawlessly. - -## Pre-built singularity containers - -We provide pre-built singularity images for all supported distributions. The containers are available here: http://repo.urz.uni-heidelberg.de/sycl/singularity/ - -The images are validated by building the hipSYCL unit tests for all supported backends, and running them for OpenMP and CUDA. - -Please note that due to legal reasons, the images do not contain the CUDA installation. Please use the `install/scripts/install-cuda.sh` script to install it afterwards. Note that this is only possible in case the container is writable; therefore we recommend installing CUDA by executing the following commands: - -``` -singularity shell build --sandbox --fakeroot .sif -singularity exec --writable --fakeroot bash install/scripts/install-cuda.sh -``` - -## Creating packages -In order to create binary packages for your distribution, you will first have to create container images as described above. Then run (e.g., for Ubuntu): -``` -cd packaging; singularity exec hipsycl-image.sif sh make-ubuntu-pkg.sh -``` -This script will generate three packages: -* `hipSYCL-base` contains the LLVM stack and clang compiler for hipSYCL. This package must always be installed -* `hipSYCL-rocm` contains a ROCm stack. This must be installed if you wish to target ROCm -* `hipSYCL` contains the actual hipSYCL libraries, headers and tools - -Creating CUDA packages is also possible, but this functionality is separate since we do not distribute CUDA binary packages for legal reasons. In order to create a CUDA package, just run the `make-ubuntu-cuda.sh` (for Ubuntu, analogously for other distributions) script. This script can be run on its own and does not require the building the entire stack including container image. -Note: If you only intend to install hipSYCL's CUDA stack on a single machine for home use, it may be easier and faster to just install it directly using the install script: Run -``` -sudo sh install-cuda.sh -``` -which will install it directly to `/opt/hipSYCL/cuda` where hipSYCL expects it. 
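
Reviewer note: the heart of the C++ changes earlier in this patch is the new `__acpp_joint_inclusive_scan`, which walks the input range in segments of the work-group size, lets the group scan each segment, and then folds the final prefix of the previous segment into the current one as a carry. A single-threaded host-side analogue of that strategy (illustrative only; all names are local to this sketch, and on device the inner loops run cooperatively across work items with barriers between the phases):

```c++
#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

// Scan each segment of size `lrange` independently, then combine with the
// last element of the previous (already finished) segment as a carry.
template <class T, class BinaryOp>
void segmented_inclusive_scan(const std::vector<T> &in, std::vector<T> &out,
                              std::size_t lrange, BinaryOp op, T identity) {
  const std::size_t n = in.size();
  const std::size_t num_segments = (n + lrange - 1) / lrange;
  out.resize(n);
  for (std::size_t s = 0; s < num_segments; ++s) {
    const std::size_t end = std::min(n, (s + 1) * lrange);
    // Per-segment inclusive scan (done by the whole group on device).
    T acc = identity;
    for (std::size_t i = s * lrange; i < end; ++i) {
      acc = op(acc, in[i]);
      out[i] = acc;
    }
    // Carry propagation: fold in the final prefix of the previous segment.
    if (s > 0) {
      const T carry = out[s * lrange - 1];
      for (std::size_t i = s * lrange; i < end; ++i)
        out[i] = op(carry, out[i]);
    }
  }
}

int main() {
  std::vector<int> in{1, 2, 3, 4, 5, 6, 7}, out;
  segmented_inclusive_scan(in, out, 3, std::plus<int>{}, 0);
  for (int v : out) std::cout << v << ' '; // 1 3 6 10 15 21 28
  std::cout << '\n';
}
```

The same building block also serves the exclusive variants in the patch: `__acpp_joint_exclusive_scan` runs the inclusive scan shifted by one element and writes the identity (or the supplied init value) to the first output slot.
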
diff --git a/install/scripts/add-hipsycl-repo/archlinux-rolling.sh b/install/scripts/add-hipsycl-repo/archlinux-rolling.sh deleted file mode 100644 index 580b574a6..000000000 --- a/install/scripts/add-hipsycl-repo/archlinux-rolling.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -o xtrace - -echo '[hipsycl]' >> /etc/pacman.conf -echo "Server = http://repo.urz.uni-heidelberg.de/sycl${1}/archlinux/x86_64" >> /etc/pacman.conf - -pacman-key --init -wget -q -O - http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc | pacman-key --add - -pacman-key --lsign-key E967BA09716F870320089583E68CC4B9B2B75080 -pacman -Sy - - diff --git a/install/scripts/add-hipsycl-repo/centos-7.sh b/install/scripts/add-hipsycl-repo/centos-7.sh deleted file mode 100644 index 431818c4d..000000000 --- a/install/scripts/add-hipsycl-repo/centos-7.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -yum update -y -yum install epel-release -y -yum install -y rpm-build sed wget curl patch -yum install centos-release-scl -y -yum-config-manager --add-repo http://repo.urz.uni-heidelberg.de/sycl$1/rpm/centos7/hipsycl.repo diff --git a/install/scripts/add-hipsycl-repo/ubuntu-18.04.sh b/install/scripts/add-hipsycl-repo/ubuntu-18.04.sh deleted file mode 100644 index 2320fdcbb..000000000 --- a/install/scripts/add-hipsycl-repo/ubuntu-18.04.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -add-apt-repository -y ppa:ubuntu-toolchain-r/test -echo "deb http://repo.urz.uni-heidelberg.de/sycl$1/deb/ ./bionic main" > /etc/apt/sources.list.d/hipsycl.list -wget -q -O - http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc | apt-key add - -apt update - diff --git a/install/scripts/add-hipsycl-repo/ubuntu-20.04.sh b/install/scripts/add-hipsycl-repo/ubuntu-20.04.sh deleted file mode 100644 index ce2c66e8c..000000000 --- a/install/scripts/add-hipsycl-repo/ubuntu-20.04.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -export DEBIAN_FRONTEND=noninteractive -echo "deb http://repo.urz.uni-heidelberg.de/sycl$1/deb/ ./focal main" > /etc/apt/sources.list.d/hipsycl.list -wget -q -O - http://repo.urz.uni-heidelberg.de/sycl/hipsycl.asc | apt-key add - -apt update diff --git a/install/scripts/base-definitions/archlinux-rolling.def b/install/scripts/base-definitions/archlinux-rolling.def deleted file mode 100644 index 5ca08ad3a..000000000 --- a/install/scripts/base-definitions/archlinux-rolling.def +++ /dev/null @@ -1,9 +0,0 @@ -BootStrap: docker -From: archlinux:base - -%setup - -%post -[ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ] && bash /install-cuda.sh || echo "Not building CUDA" -pacman -Syu --noconfirm -pacman -S --noconfirm unzip sed wget git python3 parallel tar perl base-devel cmake curl diff --git a/install/scripts/base-definitions/centos-7.def b/install/scripts/base-definitions/centos-7.def deleted file mode 100644 index 565038c78..000000000 --- a/install/scripts/base-definitions/centos-7.def +++ /dev/null @@ -1,22 +0,0 @@ -BootStrap: docker -From: centos:centos7 - -%environment -HIPSYCL_BASE_CC=gcc -HIPSYCL_BASE_CXX=g++ -. 
/opt/rh/devtoolset-9/enable -%setup - -%post -yum update -y -yum install epel-release -y -yum install -y rpm-build sed unzip python34 python3 git parallel wget perl perl-Data-Dumper cmake3 curl patch -yum install centos-release-scl -y -yum install devtoolset-9 -y -yum install lbzip2 -y -#We neeed proper cmake -yum remove cmake -ln -s /usr/bin/cmake3 /usr/bin/cmake -#bash /install-cuda.sh -#bash /install-base-spack.sh - diff --git a/install/scripts/base-definitions/ubuntu-18.04.def b/install/scripts/base-definitions/ubuntu-18.04.def deleted file mode 100644 index aa397b501..000000000 --- a/install/scripts/base-definitions/ubuntu-18.04.def +++ /dev/null @@ -1,16 +0,0 @@ -BootStrap: docker -From: ubuntu:18.04 - -%setup - -%post -apt update -apt install -y sed unzip wget gcc g++ git python3 parallel perl perl-modules cmake curl -apt install -y software-properties-common -apt install -y software-properties-common -add-apt-repository -y ppa:ubuntu-toolchain-r/test -apt -y install g++-9 - - -#bash /install-cuda.sh -#bash /install-base-spack.sh diff --git a/install/scripts/hipsycl-archlinux-rolling.def b/install/scripts/hipsycl-archlinux-rolling.def deleted file mode 100644 index d8f984163..000000000 --- a/install/scripts/hipsycl-archlinux-rolling.def +++ /dev/null @@ -1,8 +0,0 @@ -BootStrap: localimage -From: base-archlinux-rolling.sif - -%setup -cp ./install-hipsycl.sh ${SINGULARITY_ROOTFS}/install-hipsycl.sh - -%post -sh /install-hipsycl.sh diff --git a/install/scripts/hipsycl-centos-7.def b/install/scripts/hipsycl-centos-7.def deleted file mode 100644 index 48f81cb40..000000000 --- a/install/scripts/hipsycl-centos-7.def +++ /dev/null @@ -1,10 +0,0 @@ -BootStrap: localimage -From: base-centos-7.sif - -%environment -source /opt/rh/devtoolset-7/enable -%setup -cp ./install-hipsycl.sh ${SINGULARITY_ROOTFS}/install-hipsycl.sh - -%post -sh /install-hipsycl.sh diff --git a/install/scripts/hipsycl-minimal-install.sh b/install/scripts/hipsycl-minimal-install.sh deleted file mode 100644 index 5c3cbd1ac..000000000 --- a/install/scripts/hipsycl-minimal-install.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -set -e - -echo "This will install hipSYCL into the current directory in a VERY minimal configuration:" -echo "The installation will only support CPU and no LLVM compiler acceleration of SYCL kernels." -echo "For production use and performance, this may not be ideal, but if you just quickly want to have a SYCL implementation, it might be perfect :-)" -echo "" -echo "The only dependencies required are:" -echo " * Your default system compiler must support C++17 and OpenMP" -echo " * You need to have installed the boost.context and boost.fiber libraries, including development files (e.g. on Ubuntu, the libboost-all-dev package)." -echo " * python 3" -echo " * cmake" -echo "" -echo "Make sure these dependencies are satisfied and press enter to continue". -read ARG - - -rm -rf ./hipsycl-build -mkdir -p ./hipsycl-build -git clone https://github.com/illuhad/hipSYCL ./hipsycl-build -mkdir -p ./hipsycl-build/build -cd ./hipsycl-build/build -cmake -DCMAKE_INSTALL_PREFIX=`pwd`/../.. -DWITH_CUDA_BACKEND=OFF -DWITH_ROCM_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF -DWITH_OPENCL_BACKEND=OFF -DACPP_COMPILER_FEATURE_PROFILE=none .. 
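# Reviewer note: for a minimal CPU-only configuration like the one this
# (now removed) script set up, a small SYCL program is enough to smoke-test
# the toolchain end to end. The file name and the exact `acpp` invocation
# below are illustrative assumptions, not part of the scripts:
#
#   acpp -O2 -o smoke smoke.cpp && ./smoke
#
# ```c++
# // smoke.cpp -- minimal SYCL 2020 smoke test (illustrative sketch)
# #include <sycl/sycl.hpp>
# #include <iostream>
#
# int main() {
#   sycl::queue q; // default device; the CPU/OpenMP backend in this setup
#   int result = 0;
#   {
#     sycl::buffer<int, 1> buf{&result, sycl::range<1>{1}};
#     q.submit([&](sycl::handler &cgh) {
#       sycl::accessor acc{buf, cgh, sycl::write_only};
#       cgh.single_task([=] { acc[0] = 42; });
#     });
#   } // buffer destruction synchronizes and copies back to `result`
#   std::cout << "result = " << result << "\n";
#   return result == 42 ? 0 : 1;
# }
# ```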
-make install diff --git a/install/scripts/hipsycl-ubuntu-18.04.def b/install/scripts/hipsycl-ubuntu-18.04.def deleted file mode 100644 index 735962192..000000000 --- a/install/scripts/hipsycl-ubuntu-18.04.def +++ /dev/null @@ -1,8 +0,0 @@ -BootStrap: localimage -From: base-ubuntu-18.04.sif - -%setup -cp ./install-hipsycl.sh ${SINGULARITY_ROOTFS}/install-hipsycl.sh - -%post -sh /install-hipsycl.sh diff --git a/install/scripts/install-base-spack.sh b/install/scripts/install-base-spack.sh deleted file mode 100644 index b88314a5c..000000000 --- a/install/scripts/install-base-spack.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -e - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -git clone https://github.com/spack/spack.git -export SPACK_ROOT=/spack -export PATH=$SPACK_ROOT/bin:$PATH - -sed -i 's|root: .*$|root: /opt/hipSYCL/llvm/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 64|' spack/etc/spack/defaults/config.yaml -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -spack install llvm@$llvm_version libcxx=False - -sed -i 's|root: .*$|root: /opt/hipSYCL/boost/|' spack/etc/spack/defaults/config.yaml -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -spack install boost%clang@$llvm_version - -sed -i 's|root: .*$|root: /opt/hipSYCL/rocm/|' spack/etc/spack/defaults/config.yaml -find . | grep -E "(__pycache__|\.pyc|\.pyo$)" | xargs rm -rf -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -spack install hip%clang@$llvm_version - - - diff --git a/install/scripts/install-cuda.sh b/install/scripts/install-cuda.sh deleted file mode 100644 index b742fc5ad..000000000 --- a/install/scripts/install-cuda.sh +++ /dev/null @@ -1,10 +0,0 @@ -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} - -CUDA_INSTALLER_FILENAME=cuda_10.0.130_410.48_linux - -set -e -cd /tmp -if [ ! -f $CUDA_INSTALLER_FILENAME ]; then - wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/$CUDA_INSTALLER_FILENAME -fi -sh $CUDA_INSTALLER_FILENAME --override --silent --toolkit --toolkitpath $HIPSYCL_INSTALL_PREFIX/cuda diff --git a/install/scripts/install-hipsycl.sh b/install/scripts/install-hipsycl.sh deleted file mode 100644 index 2ca3d9e3e..000000000 --- a/install/scripts/install-hipsycl.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} - -set -e -HIPSYCL_BUILD_DIR=${HIPSYCL_BUILD_DIR:-/tmp/hipSYCL-installer} -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-develop} -HIPSYCL_WITH_CUDA=${HIPSYCL_WITH_CUDA:-ON} -HIPSYCL_WITH_ROCM=${HIPSYCL_WITH_ROCM:-ON} -HIPSYCL_LLVM_DIR=${HIPSYCL_LLVM_DIR:-/opt/hipSYCL/llvm/lib/} - -if [ -d "$HIPSYCL_BUILD_DIR" ]; then - read -p "The build directory already exists, do you want to use $HIPSYCL_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then - echo "Please specify a different directory, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - else - echo "Using the exisiting directory" - fi -else -echo "Cloning hipSYCL" -git clone --recurse-submodules -b $HIPSYCL_REPO_BRANCH https://github.com/$HIPSYCL_REPO_USER/hipSYCL $HIPSYCL_BUILD_DIR - -fi - -mkdir -p $HIPSYCL_BUILD_DIR/build -cd $HIPSYCL_BUILD_DIR/build - -cmake \ --DWITH_CPU_BACKEND=ON \ --DWITH_CUDA_BACKEND=$HIPSYCL_WITH_CUDA \ --DWITH_ROCM_BACKEND=$HIPSYCL_WITH_ROCM \ --DLLVM_DIR=$HIPSYCL_LLVM_DIR \ --DROCM_PATH=$HIPSYCL_INSTALL_PREFIX/rocm \ --DCMAKE_INSTALL_PREFIX=$HIPSYCL_INSTALL_PREFIX \ -.. - -make -j `nproc` install diff --git a/install/scripts/install-llvm.sh b/install/scripts/install-llvm.sh deleted file mode 100644 index babb28a52..000000000 --- a/install/scripts/install-llvm.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-1} -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-llvmorg-${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.${HIPSYCL_PKG_LLVM_VERSION_MINOR}.${HIPSYCL_PKG_LLVM_VERSION_PATCH}} - -HIPSYCL_PKG_LLVM_VERSION=${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.${HIPSYCL_PKG_LLVM_VERSION_MINOR}.${HIPSYCL_PKG_LLVM_VERSION_PATCH} - -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-release/9.x} -HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} -HIPSYCL_LLVM_BUILD_DIR=${HIPSYCL_LLVM_BUILD_DIR:-$HOME/git/llvm-vanilla} - - -set -e -if [ -d "$HIPSYCL_LLVM_BUILD_DIR" ]; then - read -p "The build directory already exists, do you want to use $HIPSYCL_LLVM_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - echo "Using the exisiting directory" - else - echo "Please specify a different directory, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - fi -else - -echo "Cloning LLVM $HIPSYCL_PKG_LLVM_REPO_BRANCH" -git clone -b $HIPSYCL_PKG_LLVM_REPO_BRANCH https://github.com/llvm/llvm-project $HIPSYCL_LLVM_BUILD_DIR -fi - -case $HIPSYCL_PKG_LLVM_VERSION in - 9.0.1) - echo "Applying patch on $HIPSYCL_PKG_LLVM_VERSION" - sed -i 's/CHECK_SIZE_AND_OFFSET(ipc_perm, mode);//g' $HIPSYCL_LLVM_BUILD_DIR/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cc - ;; -esac - - -CC=${HIPSYCL_BASE_CC:-clang} -CXX=${HIPSYCL_BASE_CXX:-clang++} -BUILD_TYPE=Release -HIPSYCL_LLVM_INSTALL_PREFIX=$INSTALL_PREFIX/llvm -TARGETS_TO_BUILD="AMDGPU;NVPTX;X86" -NUMTHREADS=`nproc` - -CMAKE_OPTIONS="-DLLVM_ENABLE_PROJECTS=clang;compiler-rt;lld;openmp \ - -DOPENMP_ENABLE_LIBOMPTARGET=OFF \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$HIPSYCL_LLVM_INSTALL_PREFIX \ - -DLLVM_ENABLE_ASSERTIONS=OFF \ - -DLLVM_TARGETS_TO_BUILD=$TARGETS_TO_BUILD \ - -DCLANG_ANALYZER_ENABLE_Z3_SOLVER=0 \ - -DLLVM_INCLUDE_BENCHMARKS=0 \ - -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON \ - -DCMAKE_INSTALL_RPATH=$HIPSYCL_LLVM_INSTALL_PREFIX/lib \ - -DLLVM_ENABLE_OCAMLDOC=OFF \ - -DLLVM_ENABLE_BINDINGS=OFF \ - -DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=OFF \ - -DLLVM_ENABLE_DUMP=OFF" - -mkdir -p $HIPSYCL_LLVM_BUILD_DIR/build -cd $HIPSYCL_LLVM_BUILD_DIR/build -cmake $CMAKE_OPTIONS $HIPSYCL_LLVM_BUILD_DIR/llvm -make -j $NUMTHREADS -make install -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/llvm-lit $HIPSYCL_LLVM_INSTALL_PREFIX/bin/llvm-lit -cp -p 
$HIPSYCL_LLVM_BUILD_DIR/build/bin/FileCheck $HIPSYCL_LLVM_INSTALL_PREFIX/bin/FileCheck -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/count $HIPSYCL_LLVM_INSTALL_PREFIX/bin/count -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/not $HIPSYCL_LLVM_INSTALL_PREFIX/bin/not -cp -p $HIPSYCL_LLVM_BUILD_DIR/build/bin/yaml-bench $HIPSYCL_LLVM_INSTALL_PREFIX/yaml-bench diff --git a/install/scripts/install-rocm.sh b/install/scripts/install-rocm.sh deleted file mode 100644 index 67c882a36..000000000 --- a/install/scripts/install-rocm.sh +++ /dev/null @@ -1,55 +0,0 @@ -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL} - -HIPSYCL_PKG_AOMP_RELEASE=${HIPSYCL_PKG_AOMP_VERSION:-0.7-7} -HIPSYCL_PKG_AOMP_TAG=${HIPSYCL_PKG_AOMP_TAG:-rel_${HIPSYCL_PKG_AOMP_RELEASE}} - -set -e -HIPSYCL_ROCM_BUILD_DIR=${HIPSYCL_ROCM_BUILD_DIR:-$HOME/git/aomp} - -export CC=${HIPSYCL_BASE_CC:-clang} -export CXX=${HIPSYCL_BASE_CXX:-clang++} -export SUDO=${SUDO:-"disable"} -export AOMP=$HIPSYCL_INSTALL_PREFIX/rocm -export BUILD_TYPE=Release -#export NVPTXGPUS=60,61,62,70 -#export AOMP_BUILD_HIPSYCL_ESSENTIAL=1 -export AOMP_BUILD_HIP=1 -export CUDA=${CUDA:-$HIPSYCL_INSTALL_PREFIX/cuda} -#export AOMP_BUILD_CUDA=1 - -if [ -d "$HIPSYCL_ROCM_BUILD_DIR" ]; then - read -p "The build directory already exists, do you want to use $HIPSYCL_ROCM_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - echo "Using the exisiting directory" - else - echo "Please specify a different directory, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - fi -else -echo "Cloning aomp" -git clone -b $HIPSYCL_PKG_AOMP_TAG https://github.com/ROCm-Developer-Tools/aomp $HIPSYCL_ROCM_BUILD_DIR/aomp -cd $HIPSYCL_ROCM_BUILD_DIR/aomp/bin -./clone_aomp.sh -fi - - -cd $HIPSYCL_ROCM_BUILD_DIR/aomp/bin -case $HIPSYCL_PKG_AOMP_RELEASE in - 0.7-7) - sed -i 's/openmp pgmath flang flang_runtime//g' $HIPSYCL_ROCM_BUILD_DIR/aomp/bin/build_aomp.sh - sed -i 's/exit 1//g' $HIPSYCL_ROCM_BUILD_DIR/aomp/bin/build_hcc.sh - # This aomp patch to support HIP in conjunction with OpenMP breaks HIP clang printf, - # so we remove it - sed -i 's/patch -p1 < $thisdir\/hip.patch//g' $HIPSYCL_ROCM_BUILD_DIR/aomp/bin/build_hip.sh - - # Remove problematic -Werror compilation arguments - sed -i 's/ -Werror//g' $HIPSYCL_ROCM_BUILD_DIR/aomp-extras/hostcall/lib/CMakeLists.txt - sed -i 's/ -Werror//g' $HIPSYCL_ROCM_BUILD_DIR/rocr-runtime/src/CMakeLists.txt - - # Remove for compatibility with glibc 2.31 - sed -i 's/CHECK_SIZE_AND_OFFSET(ipc_perm, mode);//g' $HIPSYCL_ROCM_BUILD_DIR/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cc - sed -i 's/CHECK_SIZE_AND_OFFSET(ipc_perm, mode);//g' $HIPSYCL_ROCM_BUILD_DIR/hcc/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp - ;; -esac -./build_aomp.sh diff --git a/install/scripts/packaging/common/init.sh b/install/scripts/packaging/common/init.sh deleted file mode 100644 index 6670c89ae..000000000 --- a/install/scripts/packaging/common/init.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -# define variables - version and build paths -HIPSYCL_VERSION=24.02.0 -HIPSYCL_BUILD=`date +%Y%m%d` -HIPSYCL_VERSION_STRING=${HIPSYCL_VERSION}-${HIPSYCL_BUILD} -HIPSYCL_GPG_KEY=${HIPSYCL_GPG_KEY:-B2B75080} - -#BUILD_DIR=`mktemp -d` -BUILD_DIR=${HIPSYCL_PACKAGING_DIR:-/tmp/hipsycl-packages} - -#Base packages -CUDA_PKG=hipSYCL-base-cuda-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} 
-ROCM_PKG=hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -COMMON_PKG=hipSYCL-base-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} - -#hipSYCL packages -HIPSYCL_CORE_PKG=hipSYCL-core-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_CUDA_PKG=hipSYCL-cuda-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_ROCM_PKG=hipSYCL-rocm-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_OMP_PKG=hipSYCL-omp-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} - -#Meta packages -HIPSYCL_META_PKG=hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -HIPSYCL_FULL_PKG=hipSYCL-full-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} - - -echo "Building packages in directory ${BUILD_DIR}..." - -export CUDA_DIR=${BUILD_DIR}/${CUDA_PKG} -export ROCM_DIR=${BUILD_DIR}/${ROCM_PKG} -export COMMON_DIR=${BUILD_DIR}/${COMMON_PKG} - -export HIPSYCL_CORE_DIR=${BUILD_DIR}/${HIPSYCL_CORE_PKG} -export HIPSYCL_CUDA_DIR=${BUILD_DIR}/${HIPSYCL_CUDA_PKG} -export HIPSYCL_ROCM_DIR=${BUILD_DIR}/${HIPSYCL_ROCM_PKG} -export HIPSYCL_OMP_DIR=${BUILD_DIR}/${HIPSYCL_OMP_PKG} - -export HIPSYCL_META_DIR=${BUILD_DIR}/${HIPSYCL_META_PKG} -export HIPSYCL_FULL_DIR=${BUILD_DIR}/${HIPSYCL_FULL_PKG} - -# Make sure there are no residual files -# from previous builds -rm -rf ${CUDA_DIR}/opt || true -rm -rf ${ROCM_DIR}/opt || true -rm -rf ${COMMON_DIR}/opt || true - -rm -rf ${HIPSYCL_CORE_DIR} || true -rm -rf ${HIPSYCL_CUDA_DIR} || true -rm -rf ${HIPSYCL_ROCM_DIR} || true -rm -rf ${HIPSYCL_OMP_DIR} || true - -# create build directories -mkdir -p ${CUDA_DIR}/opt/hipSYCL/cuda -mkdir -p ${ROCM_DIR}/opt/hipSYCL/rocm -mkdir -p ${COMMON_DIR}/opt/hipSYCL - -mkdir -p ${HIPSYCL_CORE_DIR}/opt/hipSYCL -mkdir -p ${HIPSYCL_CUDA_DIR}/opt/hipSYCL/lib/hipSYCL -mkdir -p ${HIPSYCL_ROCM_DIR}/opt/hipSYCL/lib/hipSYCL -mkdir -p ${HIPSYCL_OMP_DIR}/opt/hipSYCL/lib/hipSYCL - - -# sort installed binaries into build paths -cp -R /opt/hipSYCL/rocm/* ${ROCM_DIR}/opt/hipSYCL/rocm || true -cp -R /opt/hipSYCL/llvm ${COMMON_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/boost ${COMMON_DIR}/opt/hipSYCL || true - -cp -R /opt/hipSYCL/bin ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/etc ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/include ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -cp -R /opt/hipSYCL/lib ${HIPSYCL_CORE_DIR}/opt/hipSYCL || true -rm -rf ${HIPSYCL_CORE_DIR}/opt/hipSYCL/lib/hipSYCL/* || true - -cp /opt/hipSYCL/lib/hipSYCL/librt-backend-cuda.so ${HIPSYCL_CUDA_DIR}/opt/hipSYCL/lib/hipSYCL || true -cp /opt/hipSYCL/lib/hipSYCL/librt-backend-hip.so ${HIPSYCL_ROCM_DIR}/opt/hipSYCL/lib/hipSYCL || true -cp /opt/hipSYCL/lib/hipSYCL/librt-backend-omp.so ${HIPSYCL_OMP_DIR}/opt/hipSYCL/lib/hipSYCL || true - diff --git a/install/scripts/packaging/make-archlinux-cuda-pkg.sh b/install/scripts/packaging/make-archlinux-cuda-pkg.sh deleted file mode 100644 index d61e6f233..000000000 --- a/install/scripts/packaging/make-archlinux-cuda-pkg.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -set -e - -. 
./common/init.sh - -mkdir -p ${CUDA_DIR}/pkg -cp ../install-cuda.sh ${CUDA_DIR}/pkg/ - - -cat << EOF > ${CUDA_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-cuda -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="CUDA stack for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('NVIDIA CUDA EULA') -depends=('hipSYCL') -source=('install-cuda.sh') -md5sums=() -validpgpkeys=() - -package(){ - INSTALL_PREFIX=\$pkgdir/opt/hipSYCL sh ./install-cuda.sh -} - -EOF - -cd ${CUDA_DIR}/pkg && makepkg -d -c --skipinteg - - diff --git a/install/scripts/packaging/make-archlinux-pkg.sh b/install/scripts/packaging/make-archlinux-pkg.sh deleted file mode 100644 index 178a46a81..000000000 --- a/install/scripts/packaging/make-archlinux-pkg.sh +++ /dev/null @@ -1,239 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. ./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - -echo $HIPSYCL_GPG_KEY -if [ -n "$HIPSYCL_GPG_KEY" ]; then - SIGN=" --sign --key $HIPSYCL_GPG_KEY" -fi - -tar -cvf ${BUILD_DIR}/cuda-pkg.tar.gz -C ${CUDA_DIR} opt/ -tar -cvf ${BUILD_DIR}/rocm-pkg.tar.gz -C ${ROCM_DIR} opt/ -tar -cvf ${BUILD_DIR}/common-pkg.tar.gz -C ${COMMON_DIR} opt/ - -tar -cvf ${BUILD_DIR}/hipsycl-core-pkg.tar.gz -C ${HIPSYCL_CORE_DIR} opt/ -tar -cvf ${BUILD_DIR}/hipsycl-cuda-pkg.tar.gz -C ${HIPSYCL_CUDA_DIR} opt/ -tar -cvf ${BUILD_DIR}/hipsycl-rocm-pkg.tar.gz -C ${HIPSYCL_ROCM_DIR} opt/ -tar -cvf ${BUILD_DIR}/hipsycl-omp-pkg.tar.gz -C ${HIPSYCL_OMP_DIR} opt/ - -mkdir -p ${CUDA_DIR}/pkg -mkdir -p ${ROCM_DIR}/pkg -mkdir -p ${COMMON_DIR}/pkg - -mkdir -p ${HIPSYCL_CORE_DIR}/pkg -mkdir -p ${HIPSYCL_CUDA_DIR}/pkg -mkdir -p ${HIPSYCL_ROCM_DIR}/pkg -mkdir -p ${HIPSYCL_OMP_DIR}/pkg - -mkdir -p ${HIPSYCL_FULL_DIR}/pkg -mkdir -p ${HIPSYCL_META_DIR}/pkg - -mv ${BUILD_DIR}/cuda-pkg.tar.gz ${CUDA_DIR}/pkg/ -mv ${BUILD_DIR}/rocm-pkg.tar.gz ${ROCM_DIR}/pkg/ -mv ${BUILD_DIR}/common-pkg.tar.gz ${COMMON_DIR}/pkg/ - -mv ${BUILD_DIR}/hipsycl-core-pkg.tar.gz ${HIPSYCL_CORE_DIR}/pkg -mv ${BUILD_DIR}/hipsycl-cuda-pkg.tar.gz ${HIPSYCL_CUDA_DIR}/pkg -mv ${BUILD_DIR}/hipsycl-rocm-pkg.tar.gz ${HIPSYCL_ROCM_DIR}/pkg -mv ${BUILD_DIR}/hipsycl-omp-pkg.tar.gz ${HIPSYCL_OMP_DIR}/pkg - - - -cat << EOF > ${HIPSYCL_CORE_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-core-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=('hipSYCL-omp-${HIPSYCL_PKG_TYPE}' 'python' ) -provides=('hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -source=('hipsycl-core-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_CUDA_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="cuda backend for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=( 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=('hipSYCL-cuda-${HIPSYCL_PKG_TYPE}' ) -source=('hipsycl-cuda-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_ROCM_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay 
-pkgname=hipSYCL-rocm-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="rocm backend for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=('hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}' 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=('hipSYCL-rocm-${HIPSYCL_PKG_TYPE}' ) -source=('hipsycl-rocm-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_OMP_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-omp-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="omp backend for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('BSD') -depends=('hipSYCL-base-${HIPSYCL_PKG_TYPE}' 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=('hipSYCL-omp-${HIPSYCL_PKG_TYPE}' ) -source=('hipsycl-omp-pkg.tar.gz') -md5sums=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${COMMON_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-base-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="LLVM compiler stack for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=('numactl') -source=('common-pkg.tar.gz') -md5sums=() -validpgpkeys=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${ROCM_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="ROCm compiler stack and libraries for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=( 'pciutils' 'libelf' 'perl' 'pkg-config') -provides=('hipSYCL-${HIPSYCL_PKG_TYPE}' 'SYCL-${HIPSYCL_PKG_TYPE}') -source=('rocm-pkg.tar.gz') -md5sums=() -validpgpkeys=() - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -cat << EOF > ${HIPSYCL_FULL_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-full-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=( 'hipSYCL-${HIPSYCL_PKG_TYPE}' ) -provides=( 'hipSYCL-full-${HIPSYCL_PKG_TYPE}' ) - -EOF - -cat << EOF > ${HIPSYCL_META_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('LLVM') -depends=( 'hipSYCL-cuda-${HIPSYCL_PKG_TYPE}' 'hipSYCL-rocm-${HIPSYCL_PKG_TYPE}' 'hipSYCL-core-${HIPSYCL_PKG_TYPE}' ) -provides=( 'hipSYCL-${HIPSYCL_PKG_TYPE}' ) - -EOF - -cat << EOF > ${CUDA_DIR}/pkg/PKGBUILD -# Maintainer: Aksel Alpay -pkgname=hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -pkgver=${HIPSYCL_VERSION} -pkgrel=${HIPSYCL_BUILD} -pkgdesc="CUDA stack for hipSYCL" -arch=('x86_64') -url="https://github.com/illuhad/hipSYCL" -license=('NVIDIA CUDA EULA') -depends=() -provides=('cuda') -source=('cuda-pkg.tar.gz') - - -package() { - cp -R \$srcdir/opt \$pkgdir -} -EOF - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then -cd ${HIPSYCL_CORE_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -cd ${HIPSYCL_CUDA_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -cd ${HIPSYCL_ROCM_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -cd ${HIPSYCL_OMP_DIR}/pkg && makepkg -d -c --skipinteg $SIGN - -cd ${HIPSYCL_META_DIR}/pkg && 
makepkg -d -c --skipinteg $SIGN -cd ${HIPSYCL_FULL_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -cd ${COMMON_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -fi - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -cd ${ROCM_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -cd ${CUDA_DIR}/pkg && makepkg -d -c --skipinteg $SIGN -echo $HIPSYCL_PKG_BUILD_CUDA -fi diff --git a/install/scripts/packaging/make-centos-7-pkg.sh b/install/scripts/packaging/make-centos-7-pkg.sh deleted file mode 100644 index 0289d359b..000000000 --- a/install/scripts/packaging/make-centos-7-pkg.sh +++ /dev/null @@ -1,271 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. ./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - -RPM_ROOT=${BUILD_DIR}/rpm -mkdir -p ${RPM_ROOT}/{SOURCES,BUILD,RPMS,SPECS,SRPMS,tmp} - - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-core-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-omp-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CORE_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/bin -/opt/hipSYCL/lib -/opt/hipSYCL/include -/opt/hipSYCL/etc - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -Summary: cuda backend for hipSYCL -Name: hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -Summary: rocm backend for hipSYCL -Name: hipSYCL-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -Summary: omp backend for hipSYCL -Name: hipSYCL-omp-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-base-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - 
-%install -cp -R ${HIPSYCL_OMP_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/lib/hipSYCL -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} - -%files - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-full-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} - -%files - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-base.spec -Summary: base LLVM compiler stack for hipSYCL -Name: hipSYCL-base-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-base-${HIPSYCL_VERSION_STRING} -Requires: devtoolset-9, binutils, lbzip2 -AutoReq: no - -%description -%{summary} - -%install -cp -R ${COMMON_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/llvm -/opt/hipSYCL/boost - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-base-rocm.spec -Summary: ROCm stack for hipSYCL -Name: hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-rocm-${HIPSYCL_VERSION_STRING} -Requires: numactl-devel, numactl-libs, pciutils-devel, pciutils-libs, perl, elfutils-libelf-devel -AutoReq: no - -%description -%{summary} - -%install -cp -R ${ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/rocm - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda.spec -Summary: CUDA stack for hipSYCL -Name: hipSYCL-cuda -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: NVIDIA CUDA EULA -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-cuda-${HIPSYCL_VERSION_STRING} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} - -%files -/opt/hipSYCL/cuda - -EOF - - -cd ${RPM_ROOT}/SPECS - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then -rpmbuild -bb hipSYCL-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec - -rpmbuild -bb hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -rpmbuild -bb hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -rpmbuild -bb hipSYCL-base.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -rpmbuild -bb hipSYCL-base-rocm.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -rpmbuild -D 
'%_python_bytecompile_errors_terminate_build 0' -bb hipSYCL-cuda.spec -fi diff --git a/install/scripts/packaging/make-centos-8-pkg.sh b/install/scripts/packaging/make-centos-8-pkg.sh deleted file mode 100644 index b4d706284..000000000 --- a/install/scripts/packaging/make-centos-8-pkg.sh +++ /dev/null @@ -1,282 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. ./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - -RPM_ROOT=${BUILD_DIR}/rpm -mkdir -p ${RPM_ROOT}/{SOURCES,BUILD,RPMS,SPECS,SRPMS,tmp} - -# We need to use %undefine __brp_mangle_shebangs -# since llvm contains ambiguous python shebangs -# Probably fixing these here is not the best idea -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-core-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-omp-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CORE_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/bin -/opt/hipSYCL/lib -/opt/hipSYCL/include -/opt/hipSYCL/etc - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -Summary: cuda backend for hipSYCL -Name: hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -Summary: rocm backend for hipSYCL -Name: hipSYCL-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/lib/hipSYCL - -EOF -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -Summary: omp backend for hipSYCL -Name: hipSYCL-omp-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_PKG_TYPE}-${HIPSYCL_VERSION_STRING} -Requires: python3, hipSYCL-base-${HIPSYCL_PKG_TYPE}, hipSYCL-core-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${HIPSYCL_OMP_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files
-/opt/hipSYCL/lib/hipSYCL -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-rocm-${HIPSYCL_PKG_TYPE}, hipSYCL-cuda-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec -Summary: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -Name: hipSYCL-full-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: BSD -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-${HIPSYCL_VERSION_STRING} -Requires: hipSYCL-${HIPSYCL_PKG_TYPE} -AutoReq: no - -%description -%{summary} - -%install - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-base.spec -Summary: base LLVM compiler stack for hipSYCL -Name: hipSYCL-base-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-base-${HIPSYCL_VERSION_STRING} -Requires: binutils, lbzip2, gcc-toolset-9-toolchain -AutoReq: no - -%description -%{summary} - -%install -cp -R ${COMMON_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/llvm -/opt/hipSYCL/boost - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-base-rocm.spec -Summary: ROCm stack for hipSYCL -Name: hipSYCL-base-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: LLVM -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-rocm-${HIPSYCL_VERSION_STRING} -Requires: numactl-devel, numactl-libs, pciutils-devel, pciutils-libs, perl, elfutils-libelf-devel -AutoReq: no - -%description -%{summary} - -%install -cp -R ${ROCM_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/rocm - -EOF - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda.spec -Summary: CUDA stack for hipSYCL -Name: hipSYCL-cuda -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: NVIDIA CUDA EULA -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-cuda-${HIPSYCL_VERSION_STRING} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${CUDA_DIR}/* %{buildroot} - -%global __python %{__python3} -%undefine __brp_mangle_shebangs - -%files -/opt/hipSYCL/cuda - -EOF - - -cd ${RPM_ROOT}/SPECS - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-full-${HIPSYCL_PKG_TYPE}.spec - -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-core-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-cuda-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-rocm-${HIPSYCL_PKG_TYPE}.spec -rpmbuild --define "_topdir 
$HIPSYCL_PACKAGING_DIR" -bb hipSYCL-omp-${HIPSYCL_PKG_TYPE}.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-base.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -bb hipSYCL-base-rocm.spec -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -rpmbuild --define "_topdir $HIPSYCL_PACKAGING_DIR" -D '%_python_bytecompile_errors_terminate_build 0' -bb hipSYCL-cuda.spec -fi diff --git a/install/scripts/packaging/make-centos-cuda-pkg.sh b/install/scripts/packaging/make-centos-cuda-pkg.sh deleted file mode 100644 index 52e345fa5..000000000 --- a/install/scripts/packaging/make-centos-cuda-pkg.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. ./common/init.sh - - -RPM_ROOT=${BUILD_DIR}/rpm -mkdir -p ${RPM_ROOT}/{SOURCES,BUILD,RPMS,SPECS,SRPMS,tmp} - -rm -rf ${CUDA_DIR}/* -INSTALL_PREFIX=${CUDA_DIR}/opt/hipSYCL sh ../install-cuda.sh -rm -rf ${CUDA_DIR}/opt/hipSYCL/cuda/samples - -cat << EOF > ${RPM_ROOT}/SPECS/hipSYCL-cuda.spec -Summary: CUDA stack for hipSYCL -Name: hipSYCL-cuda -Version: ${HIPSYCL_VERSION} -Release: ${HIPSYCL_BUILD} -License: NVIDIA CUDA EULA -Packager: Aksel Alpay -Group: Development/Tools -BuildRequires: coreutils -BuildRoot: ${RPM_ROOT}/tmp/hipSYCL-cuda-${HIPSYCL_VERSION_STRING} -AutoReq: no - -%description -%{summary} - -%install -cp -R ${CUDA_DIR}/* %{buildroot} - - -%files -/opt/hipSYCL/cuda - -EOF - - -cd ${RPM_ROOT}/SPECS -rpmbuild -D '%_python_bytecompile_errors_terminate_build 0' -bb hipSYCL-cuda.spec - diff --git a/install/scripts/packaging/make-ubuntu-cuda-pkg.sh b/install/scripts/packaging/make-ubuntu-cuda-pkg.sh deleted file mode 100644 index c96a2d045..000000000 --- a/install/scripts/packaging/make-ubuntu-cuda-pkg.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -set -e - -. ./common/init.sh - -mkdir -p ${CUDA_DIR}/DEBIAN - -cat << EOF > ${CUDA_DIR}/DEBIAN/control -Package: hipsycl-cuda -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Maintainer: Aksel Alpay -Depends: hipSYCL -Description: CUDA stack for hipSYCL - Provides CUDA toolkit for hipSYCL -EOF - -INSTALL_PREFIX=${CUDA_DIR}/opt/hipSYCL sh ../install-cuda.sh - -cd ${BUILD_DIR} -dpkg-deb --build ${CUDA_PKG} - diff --git a/install/scripts/packaging/make-ubuntu-pkg.sh b/install/scripts/packaging/make-ubuntu-pkg.sh deleted file mode 100644 index 833e02d9f..000000000 --- a/install/scripts/packaging/make-ubuntu-pkg.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash -# Intended to be executed inside the built singularity container - -set -e - -. 
./common/init.sh - -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} -HIPSYCL_PKG_BUILD_HIPSYCL=${HIPSYCL_PKG_BUILD_HIPSYCL:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-OFF} - - -mkdir -p ${CUDA_DIR}/DEBIAN -mkdir -p ${ROCM_DIR}/DEBIAN -mkdir -p ${COMMON_DIR}/DEBIAN - -mkdir -p ${HIPSYCL_CORE_DIR}/DEBIAN -mkdir -p ${HIPSYCL_CUDA_DIR}/DEBIAN -mkdir -p ${HIPSYCL_ROCM_DIR}/DEBIAN -mkdir -p ${HIPSYCL_OMP_DIR}/DEBIAN - -mkdir -p ${HIPSYCL_META_DIR}/DEBIAN -mkdir -p ${HIPSYCL_FULL_DIR}/DEBIAN - -cat << EOF > ${HIPSYCL_CORE_DIR}/DEBIAN/control -Package: hipsycl-core-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-omp-${HIPSYCL_PKG_TYPE}, python3 (>= 3.0) -Maintainer: Aksel Alpay -Description: hipSYCL${HIPSYCL_VERSION_STRING} - Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs -EOF - -cat << EOF > ${HIPSYCL_CUDA_DIR}/DEBIAN/control -Package: hipsycl-cuda-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-core-${HIPSYCL_PKG_TYPE} -Maintainer: Aksel Alpay -Description: hipSYCL${HIPSYCL_VERSION_STRING} - Cuda backend for hipSYCL -EOF - -cat << EOF > ${HIPSYCL_ROCM_DIR}/DEBIAN/control -Package: hipsycl-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-core-${HIPSYCL_PKG_TYPE}, hipsycl-base-rocm-${HIPSYCL_PKG_TYPE} , python3 (>= 3.0) -Maintainer: Aksel Alpay -Description: hipSYCL${HIPSYCL_VERSION_STRING} - Rocm backend for hipSYCL -EOF - -cat << EOF > ${HIPSYCL_OMP_DIR}/DEBIAN/control -Package: hipsycl-omp-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-core-${HIPSYCL_PKG_TYPE}, hipsycl-base-${HIPSYCL_PKG_TYPE} , python3 (>= 3.0) -Maintainer: Aksel Alpay -Description: hipSYCL${HIPSYCL_VERSION_STRING} - omp backend for hipSYCL -EOF - -cat << EOF > ${COMMON_DIR}/DEBIAN/control -Package: hipsycl-base-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: g++-9, libnuma1, build-essential -Maintainer: Aksel Alpay -Description: hipSYCL base compiler stack - Provides an LLVM compiler stack for hipSYCL -EOF - -cat << EOF > ${ROCM_DIR}/DEBIAN/control -Package: hipsycl-base-rocm-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: libpci-dev -Maintainer: Aksel Alpay -Description: ROCm compiler stack for hipSYCL-${HIPSYCL_PKG_TYPE} Provides ROCm libraries for hipSYCL -EOF - - -cat << EOF > ${HIPSYCL_FULL_DIR}/DEBIAN/control -Package: hipsycl-full-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-rocm-${HIPSYCL_PKG_TYPE}, hipsycl-cuda-${HIPSYCL_PKG_TYPE} -Maintainer: Aksel Alpay -Description: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs - -EOF - -cat << EOF > ${HIPSYCL_META_DIR}/DEBIAN/control -Package: hipsycl-${HIPSYCL_PKG_TYPE} -Version: ${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Depends: hipsycl-full-${HIPSYCL_PKG_TYPE} -Maintainer: Aksel Alpay -Description: Implementation of Khronos SYCL for CPUs, AMD GPUs and NVIDIA GPUs - -EOF - -cat << EOF > ${CUDA_DIR}/DEBIAN/control -Package: hipsycl-base-cuda-${HIPSYCL_PKG_TYPE} -Version: 
${HIPSYCL_VERSION_STRING} -Section: base -Priority: optional -Architecture: amd64 -Maintainer: Aksel Alpay -Description: CUDA stack for hipSYCL - Provides CUDA toolkit for hipSYCL -EOF - -cd ${BUILD_DIR} - -if [ "$HIPSYCL_PKG_BUILD_ROCM" = "ON" ]; then -dpkg-deb --build ${ROCM_PKG} -fi - -if [ "$HIPSYCL_PKG_BUILD_BASE" = "ON" ]; then -dpkg-deb --build ${COMMON_PKG} -fi - -if [ "$HIPSYCL_PKG_BUILD_HIPSYCL" = "ON" ]; then - -dpkg-deb --build ${HIPSYCL_CORE_DIR} -dpkg-deb --build ${HIPSYCL_CUDA_DIR} -dpkg-deb --build ${HIPSYCL_ROCM_DIR} -dpkg-deb --build ${HIPSYCL_OMP_DIR} - -dpkg-deb --build ${HIPSYCL_META_PKG} -dpkg-deb --build ${HIPSYCL_FULL_PKG} -fi - -if [ "$HIPSYCL_PKG_BUILD_CUDA" = "ON" ]; then -dpkg-deb --build ${CUDA_PKG} -fi - diff --git a/install/scripts/rebuild-images.sh b/install/scripts/rebuild-images.sh deleted file mode 100644 index 2cdfb9a17..000000000 --- a/install/scripts/rebuild-images.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace - -if [ "$#" -ne 2 ]; then - echo " - Usage: <distro> <[script to run] OR build> - distro: the distro to install the software for - script_to_run: Execute the install script located in HIPSYCL_PKG_SCRIPT_DIR (by default install/scripts) - build: Build the base image into the HIPSYCL_PKG_CONTAINER_DIR folder. The necessary scripts are copied to the image - during the build; the definition file is located at: HIPSYCL_PKG_SCRIPT_DIR/base-<distro>.def - - Important ENV variables: - - HIPSYCL_PKG_CONTAINER_DIR - " - exit -1 -fi -distro=$1 -candidate=$2 - -HIPSYCL_PKG_SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -HIPSYCL_PKG_CONTAINER_DIR=${HIPSYCL_PKG_CONTAINER_DIR:-$HIPSYCL_PKG_SCRIPT_DIR/containers} -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-release/9.x} -echo $HIPSYCL_PKG_CONTAINER_DIR -HIPSYCL_PKG_BUILD_CUDA=${HIPSYCL_PKG_BUILD_CUDA:-ON} -HIPSYCL_PKG_BUILD_ROCM=${HIPSYCL_PKG_BUILD_ROCM:-ON} -HIPSYCL_PKG_BUILD_BASE=${HIPSYCL_PKG_BUILD_BASE:-ON} - -SINGULARITYENV_HIPSYCL_PKG_BUILD_CUDA=$HIPSYCL_PKG_BUILD_CUDA -SINGULARITYENV_HIPSYCL_PKG_BUILD_ROCM=$HIPSYCL_PKG_BUILD_ROCM -SINGULARITYENV_HIPSYCL_PKG_BUILD_BASE=$HIPSYCL_PKG_BUILD_BASE -SINGULARITYENV_HIPSYCL_PKG_LLVM_REPO_BRANCH=$HIPSYCL_PKG_LLVM_REPO_BRANCH -SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MAJOR=$HIPSYCL_PKG_LLVM_VERSION_MAJOR -SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MINOR=$HIPSYCL_PKG_LLVM_VERSION_MINOR -SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_PATCH=$HIPSYCL_PKG_LLVM_VERSION_PATCH -SINGULARITYENV_HIPSYCL_PKG_AOMP_RELEASE=$HIPSYCL_PKG_AOMP_RELEASE -SINGULARITYENV_HIPSYCL_PKG_AOMP_TAG=$HIPSYCL_PKG_AOMP_TAG -SINGULARITYENV_HIPSYCL_BUILD_DIR_PREFIX="/tmp/hipsycl-build-for-" -SINGULARITYENV_HIPSYCL_BUILD_DIR=$SINGULARITYENV_HIPSYCL_BUILD_DIR_PREFIX$distro - -export SINGULARITYENV_HIPSYCL_PKG_BUILD_CUDA -export SINGULARITYENV_HIPSYCL_PKG_BUILD_ROCM -export SINGULARITYENV_HIPSYCL_PKG_BUILD_BASE -export SINGULARITYENV_HIPSYCL_PKG_LLVM_REPO_BRANCH -export SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MAJOR -export SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_MINOR -export SINGULARITYENV_HIPSYCL_PKG_LLVM_VERSION_PATCH -export SINGULARITYENV_HIPSYCL_PKG_AOMP_RELEASE -export SINGULARITYENV_HIPSYCL_PKG_AOMP_TAG -export SINGULARITYENV_HIPSYCL_BUILD_DIR - - -echo $HIPSYCL_PKG_CONTAINER_DIR -cd $HIPSYCL_PKG_SCRIPT_DIR -mkdir -p $HIPSYCL_PKG_CONTAINER_DIR -mkdir -p /tmp/hipsycl-pkg-builder - -if [ "$candidate" = "build" ]; then - echo "Building $distro image...
with base pkgs" - singularity build --fakeroot --sandbox -F $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro base-definitions/$distro.def - echo "Building $distro hipSYCL base via spack" -elif [ "$candidate" = "cleanup" ]; then - rm -rf $SINGULARITYENV_HIPSYCL_BUILD_DIR -else - #echo $HIPSYCL_PKG_LLVM_VERSION_MAJOR - tmpdir=$HIPSYCL_PKG_CONTAINER_DIR/tmp-$distro - mkdir -p $tmpdir - singularity -d exec --fakeroot --writable --no-home -B $HIPSYCL_PKG_SCRIPT_DIR:/mnt \ - $HIPSYCL_PKG_CONTAINER_DIR/hipsycl-$distro \ - bash /mnt/$candidate.sh -fi diff --git a/install/scripts/spack-install/boost.sh b/install/scripts/spack-install/boost.sh deleted file mode 100644 index 5943aee11..000000000 --- a/install/scripts/spack-install/boost.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -if [ ! -d ./spack ]; then - git clone https://github.com/spack/spack.git #-b v0.16.1 -fi -export SPACK_ROOT=/root/spack -export PATH=$SPACK_ROOT/bin:$PATH -. $SPACK_ROOT/share/spack/setup-env.sh - -sed -i 's|root: .*$|root: /opt/hipSYCL/boost/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 16|' spack/etc/spack/defaults/config.yaml -. $SPACK_ROOT/share/spack/setup-env.sh -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ -# Spack distributed build in this form causes timeouts sometimes... maybe use an upstream solution - -parallel --joblog /tmp/spack-install-boost.exit --lb -N0 spack install boost%clang@$llvm_version context=True fiber=True target=x86_64 cxxstd=11 ::: {1..16} || error=1 -if [ "$error" = "1" ]; then - spack install boost%clang@$llvm_version context=True fiber=True target=x86_64 cxxstd=11 -fi -spack gc -y - diff --git a/install/scripts/spack-install/cuda.sh b/install/scripts/spack-install/cuda.sh deleted file mode 120000 index cb5a0b63e..000000000 --- a/install/scripts/spack-install/cuda.sh +++ /dev/null @@ -1 +0,0 @@ -../install-cuda.sh \ No newline at end of file diff --git a/install/scripts/spack-install/hipsycl.sh b/install/scripts/spack-install/hipsycl.sh deleted file mode 100644 index 9872251f3..000000000 --- a/install/scripts/spack-install/hipsycl.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} -HIPSYCL_PKG_LLVM_REPO_BRANCH=${HIPSYCL_PKG_LLVM_REPO_BRANCH:-release/${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.x} - -export HIPSYCL_INSTALL_PREFIX=${HIPSYCL_INSTALL_PREFIX:-/opt/hipSYCL/} - -set -e -HIPSYCL_BUILD_DIR=${HIPSYCL_BUILD_DIR:-/tmp/hipsycl-installer-hipsyclbuildbot} -HIPSYCL_REPO_USER=${HIPSYCL_REPO_USER:-illuhad} -HIPSYCL_REPO_BRANCH=${HIPSYCL_REPO_BRANCH:-develop} -HIPSYCL_WITH_CUDA=${HIPSYCL_WITH_CUDA:-ON} -HIPSYCL_WITH_ROCM=${HIPSYCL_WITH_ROCM:-ON} - -LLVM_INCLUDE_PATH=$HIPSYCL_INSTALL_PREFIX/llvm/llvm/lib/clang/${HIPSYCL_PKG_LLVM_VERSION_MAJOR}.\ -${HIPSYCL_PKG_LLVM_VERSION_MINOR}.\ -${HIPSYCL_PKG_LLVM_VERSION_PATCH}/include -if [ -d "$HIPSYCL_BUILD_DIR" ]; then - read -p "hipsycl_installer: The build directory already exists,
do you want to use $HIPSYCL_BUILD_DIR anyways?[y]" -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then - echo "hipsycl_installer: Please specify a different directory than $HIPSYCL_BUILD_DIR, exiting" - [[ "$0" = "$BASH_SOURCE" ]] && exit 1 || return 1 - else - echo "hipsycl_installer: Using the existing directory" - fi -else -echo "hipsycl_installer: Cloning hipSYCL" -git clone --recurse-submodules -b $HIPSYCL_REPO_BRANCH https://github.com/$HIPSYCL_REPO_USER/hipSYCL $HIPSYCL_BUILD_DIR - -fi - -mkdir -p $HIPSYCL_BUILD_DIR/build -cd $HIPSYCL_BUILD_DIR/build - -# We need the llvm module to be loaded in order to be able to find the openmp rt -export SPACK_ROOT=/root/spack -export PATH=$SPACK_ROOT/bin:$PATH - -source /etc/profile -. $SPACK_ROOT/share/spack/setup-env.sh - -sed -i 's|root: .*$|root: /opt/hipSYCL/llvm/|' $SPACK_ROOT/etc/spack/defaults/config.yaml -spack load --only package llvm -rocm_path=/opt/hipSYCL/rocm/ - -cmake \ -DCMAKE_C_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang \ -DCMAKE_CXX_COMPILER=/opt/hipSYCL/llvm/llvm/bin/clang++ \ -DWITH_CPU_BACKEND=ON \ -DWITH_CUDA_BACKEND=$HIPSYCL_WITH_CUDA \ -DWITH_ROCM_BACKEND=$HIPSYCL_WITH_ROCM \ -DLLVM_DIR=/opt/hipSYCL/llvm/llvm/ \ -DROCM_PATH=/opt/hipSYCL/rocm/hip/ \ -DBOOST_ROOT=/opt/hipSYCL/boost/boost/ \ -DCUDA_TOOLKIT_ROOT_DIR=/opt/hipSYCL/cuda \ -DCLANG_EXECUTABLE_PATH=/opt/hipSYCL/llvm/llvm/bin/clang++ \ -DCLANG_INCLUDE_PATH=$LLVM_INCLUDE_PATH \ -DCMAKE_INSTALL_PREFIX=$HIPSYCL_INSTALL_PREFIX \ -DCMAKE_PREFIX_PATH="$rocm_path/comgr/lib/cmake;$rocm_path/rocm-device-libs/lib/cmake;$rocm_path/hsa-rocr-dev/lib/cmake;$rocm_path/hsa-rocr-dev/;$rocm_path/hip/lib/cmake" \ -.. - -make -j 16 install -cp /mnt/spack-install/spack-syclcc.json /opt/hipSYCL/etc/hipSYCL/syclcc.json diff --git a/install/scripts/spack-install/llvm.sh b/install/scripts/spack-install/llvm.sh deleted file mode 100644 index ec1e705fa..000000000 --- a/install/scripts/spack-install/llvm.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -if [ ! -d ./spack ]; then - git clone https://github.com/spack/spack.git #-b v0.16.1 - # git clone https://github.com/spack/spack.git spack_upstream - # sed -i 's|- $spack/var/spack/repos/builtin|- /root/spack_upstream/var/spack/repos/builtin|' spack/etc/spack/defaults/repos.yaml -fi -export SPACK_ROOT=/root/spack -export PATH=$SPACK_ROOT/bin:$PATH -. $SPACK_ROOT/share/spack/setup-env.sh -spack compiler find || echo "No new compilers"; spack compilers - -sed -i 's|root: .*$|root: /opt/hipSYCL/llvm/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 16|' spack/etc/spack/defaults/config.yaml -sed -i 's|projects.append("clang-tools-extra")|#projects.append("clang-tools-extra")|' spack/var/spack/repos/builtin/packages/llvm/package.py - -.
$SPACK_ROOT/share/spack/setup-env.sh -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -parallel --lb -N0 spack install llvm@$llvm_version cuda=False libcxx=True polly=False lldb=False lld=True internal_unwind=False gold=False target=x86_64 build_type=MinSizeRel -flang ::: {1..16} || error=1 -if [ "$error" = "1" ]; then - spack install llvm@$llvm_version -flang cuda=False libcxx=True polly=False lldb=False lld=True internal_unwind=False gold=False target=x86_64 build_type=MinSizeRel -fi -#spack install llvm@$llvm_version cuda=False libcxx=False target=x86_64 -spack load llvm -spack compiler find /opt/hipSYCL/llvm/llvm/ -spack unload llvm - diff --git a/install/scripts/spack-install/rocm.sh b/install/scripts/spack-install/rocm.sh deleted file mode 100644 index 7623aa752..000000000 --- a/install/scripts/spack-install/rocm.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -e -set -o xtrace -HIPSYCL_PKG_LLVM_VERSION_MAJOR=${HIPSYCL_PKG_LLVM_VERSION_MAJOR:-11} -HIPSYCL_PKG_LLVM_VERSION_MINOR=${HIPSYCL_PKG_LLVM_VERSION_MINOR:-0} -HIPSYCL_PKG_LLVM_VERSION_PATCH=${HIPSYCL_PKG_LLVM_VERSION_PATCH:-0} -HIPSYCL_HIP_VERSION=${HIPSYCL_HIP_VERSION:-4.0.0} - -llvm_version=$HIPSYCL_PKG_LLVM_VERSION_MAJOR.$HIPSYCL_PKG_LLVM_VERSION_MINOR.$HIPSYCL_PKG_LLVM_VERSION_PATCH -if [ ! -d ./spack ]; then - git clone https://github.com/spack/spack.git #-b v0.16.1 - # git clone https://github.com/spack/spack.git spack_upstream - # echo "upstreams: - # spack-instance-1: - # install_tree: /root/spack_upstream/opt/spack" > /root/spack/etc/spack/defaults/upstreams.yaml -fi -export SPACK_ROOT=~/spack -export PATH=$SPACK_ROOT/bin:$PATH -. $SPACK_ROOT/share/spack/setup-env.sh - -sed -i 's|root: .*$|root: /opt/hipSYCL/rocm/|' spack/etc/spack/defaults/config.yaml -sed -i 's|all: .*$|all: ${PACKAGE}|' spack/etc/spack/defaults/config.yaml -sed -i 's|# build_jobs: .*$|build_jobs: 16|' spack/etc/spack/defaults/config.yaml -. 
$SPACK_ROOT/share/spack/setup-env.sh -spack compiler find /opt/hipSYCL/llvm/llvm/bin/ - -# Sometimes some parallel instances exit due to waiting too long for a lock -# In case that happens we run the sequential version to check if everything has been -# installed properly -parallel --lb -N0 spack install hip@$HIPSYCL_HIP_VERSION%clang@$llvm_version target=x86_64 ::: {1..16} || error="1" -if [ "$error" = "1" ]; then - spack install hip%clang@$llvm_version target=x86_64 -fi -spack gc -y - diff --git a/install/scripts/spack-install/spack-syclcc.json b/install/scripts/spack-install/spack-syclcc.json deleted file mode 100644 index 2216abd94..000000000 --- a/install/scripts/spack-install/spack-syclcc.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "default-clang" : "/opt/hipSYCL/llvm/llvm/bin/clang++", - "default-platform" : "cuda", - "default-cuda-path" : "/opt/hipSYCL/cuda", - "default-gpu-arch" : "", - "default-cpu-cxx" : "/opt/hipSYCL/llvm/llvm/bin/clang++", - "default-rocm-path" : "/opt/hipSYCL/rocm", - "default-use-bootstrap-mode" : "false", - "default-is-dryrun" : "false", - "default-clang-include-path" : "/opt/hipSYCL/llvm/llvm/lib/clang/11.0.0/include/..", - "default-sequential-link-line" : "-L/opt/hipSYCL/boost/boost/lib -lboost_context -lboost_fiber -lomp -Wl,-rpath=/opt/hipSYCL/boost/boost/lib", - "default-sequential-cxx-flags" : "-I/opt/hipSYCL/boost/boost/include", - "default-omp-link-line" : "-L/opt/hipSYCL/boost/boost/lib -lboost_context -lboost_fiber -Wl,-rpath=/opt/hipSYCL/boost/boost/lib -Wl,-rpath=/opt/hipSYCL/llvm/llvm/lib -fopenmp", - "default-omp-cxx-flags" : "-I/opt/hipSYCL/boost/boost/include -fopenmp", - "default-rocm-link-line" : "-Wl,-rpath=$HIPSYCL_ROCM_PATH/lib -Wl,-rpath=$HIPSYCL_ROCM_PATH/hip/lib -Wl,-rpath=/opt/hipSYCL/llvm/llvm/lib -L/opt/hipSYCL/rocm/lib -L/opt/hipSYCL/rocm/hip/lib -lamdhip64", - "default-rocm-cxx-flags" : "-isystem /opt/hipSYCL/llvm/llvm/lib/clang/11.0.0/include/..
-U__FLOAT128__ -U__SIZEOF_FLOAT128__ -I$HIPSYCL_ROCM_PATH/hsa-rocr-dev/include -I$HIPSYCL_ROCM_PATH/hip/include --rocm-path=$HIPSYCL_ROCM_PATH/rocm-device-libs", - "default-cuda-link-line" : "-Wl,-rpath=$HIPSYCL_CUDA_LIB_PATH -Wl,-rpath=/opt/hipSYCL/llvm/llvm/lib -L$HIPSYCL_CUDA_LIB_PATH -lcudart", - "default-cuda-cxx-flags" : "-U__FLOAT128__ -U__SIZEOF_FLOAT128__" -} diff --git a/mkdocs.yml b/mkdocs.yml index 915b75fa0..2e2001cc5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,13 +10,14 @@ nav: - - 'Usage' : - - 'Using AdaptiveCpp' : 'using-hipsycl.md' + - 'Using AdaptiveCpp' : 'using-acpp.md' - 'Compilation model' : 'compilation.md' - 'Env variables' : 'env_variables.md' - 'Performance guide' : 'performance.md' - 'Macros' : 'macros.md' - 'SYCL interoperability' : 'hip-source-interop.md' - 'C++ standard parallelism offloading (stdpar)' : 'stdpar.md' + - 'AdaptiveCpp parallel algorithms library' : 'algorithms.md' - 'AdaptiveCpp design' : - 'Architecture' : 'architecture.md' @@ -73,4 +74,4 @@ markdown_extensions: - admonition - pymdownx.details - - pymdownx.superfences \ No newline at end of file + - pymdownx.superfences diff --git a/src/compiler/CMakeLists.txt b/src/compiler/CMakeLists.txt index 96eb97f29..f9ebb1d39 100644 --- a/src/compiler/CMakeLists.txt +++ b/src/compiler/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.5) +cmake_minimum_required(VERSION 3.10) project(acpp-clang) get_filename_component(CLANG_BINARY_PREFIX ${CLANG_EXECUTABLE_PATH} DIRECTORY) @@ -36,13 +36,13 @@ if(WITH_ACCELERATED_CPU OR WITH_SSCP_COMPILER) add_library(acpp-clang-cbs OBJECT ${CBS_PLUGIN} ) - + set_property(TARGET acpp-clang-cbs PROPERTY POSITION_INDEPENDENT_CODE ON) endif() if(WITH_SSCP_COMPILER) set(WITH_REFLECTION_BUILTINS ON) - set(SSCP_COMPILER + set(SSCP_COMPILER sscp/KernelOutliningPass.cpp sscp/IRConstantReplacer.cpp sscp/DynamicFunctionSupport.cpp @@ -75,7 +75,7 @@ else() set(REFLECTION_BUILTINS "") endif() -add_library(acpp-clang SHARED +add_library(acpp-clang MODULE AdaptiveCppClangPlugin.cpp GlobalsPruningPass.cpp ${SSCP_COMPILER} @@ -102,8 +102,11 @@ function(configure_target) ${CMAKE_BINARY_DIR}/include) target_include_directories(${target} SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) + separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) + target_compile_definitions(${target} PRIVATE + ${LLVM_DEFINITIONS_LIST}) target_compile_definitions(${target} PRIVATE - ${LLVM_DEFINITIONS} -DHIPSYCL_COMPILER_COMPONENT) + -DHIPSYCL_COMPILER_COMPONENT) if(ROCM_VERSION_MAJOR) target_compile_definitions(${target} PRIVATE -DROCM_CLANG_VERSION_MAJOR=${ROCM_VERSION_MAJOR} -DROCM_CLANG_VERSION_MINOR=${ROCM_VERSION_MINOR} -DROCM_CLANG_VERSION_PATCH=${ROCM_VERSION_PATCH}) diff --git a/src/compiler/cbs/IRUtils.cpp b/src/compiler/cbs/IRUtils.cpp index 7b598e5b2..64c022856 100644 --- a/src/compiler/cbs/IRUtils.cpp +++ b/src/compiler/cbs/IRUtils.cpp @@ -27,6 +27,24 @@ namespace hipsycl::compiler::utils { using namespace hipsycl::compiler::cbs; +void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm::Value *To, llvm::StringRef LogPrefix) { + auto M = F.getParent(); + auto GV = M->getGlobalVariable(GlobalVarName); + if (!GV) + return; + + HIPSYCL_DEBUG_INFO << LogPrefix << "RUOGVW: " << *GV << " with " << *To << "\n"; + llvm::SmallVector<llvm::Instruction *> ToErase; + for (auto U : GV->users()) { + if (auto I = llvm::dyn_cast<llvm::Instruction>(U); I && I->getFunction() == &F) { + HIPSYCL_DEBUG_INFO << LogPrefix << "RUOGVW: " << *I << " with " << *To << "\n"; + I->replaceAllUsesWith(To); + } + } + for
(auto I : ToErase) + I->eraseFromParent(); +} + llvm::Loop *updateDtAndLi(llvm::LoopInfo &LI, llvm::DominatorTree &DT, const llvm::BasicBlock *B, llvm::Function &F) { DT.reset(); diff --git a/src/compiler/cbs/LoopSplitterInlining.cpp b/src/compiler/cbs/LoopSplitterInlining.cpp index 2e27d141f..bcef6bb15 100644 --- a/src/compiler/cbs/LoopSplitterInlining.cpp +++ b/src/compiler/cbs/LoopSplitterInlining.cpp @@ -11,6 +11,7 @@ #include "hipSYCL/compiler/cbs/LoopSplitterInlining.hpp" #include "hipSYCL/compiler/cbs/IRUtils.hpp" #include "hipSYCL/compiler/cbs/SplitterAnnotationAnalysis.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" #include "hipSYCL/common/debug.hpp" @@ -119,7 +120,8 @@ bool fillTransitiveSplitterCallers(llvm::Function &F, std::transform(F.begin(), F.end(), std::back_inserter(Blocks), [](auto &BB) { return &BB; }); if (fillTransitiveSplitterCallers(Blocks, SAA, FuncsWSplitter, - InIntrinsic || F.getName().startswith("__acpp_sscp"))) { + InIntrinsic || + hipsycl::llvmutils::starts_with(F.getName(), "__acpp_sscp"))) { FuncsWSplitter.insert(&F); return true; } diff --git a/src/compiler/cbs/SubCfgFormation.cpp b/src/compiler/cbs/SubCfgFormation.cpp index b04ad82c1..8b9e92212 100644 --- a/src/compiler/cbs/SubCfgFormation.cpp +++ b/src/compiler/cbs/SubCfgFormation.cpp @@ -127,7 +127,8 @@ getLocalSizeArgumentFromAnnotation(llvm::Function &F) { for (auto &BB : F) for (auto &I : BB) if (auto *UI = llvm::dyn_cast<llvm::CallInst>(&I)) - if (UI->getCalledFunction()->getName().startswith("llvm.var.annotation")) { + if (hipsycl::llvmutils::starts_with(UI->getCalledFunction()->getName(), + "llvm.var.annotation")) { HIPSYCL_DEBUG_INFO << *UI << '\n'; llvm::GlobalVariable *AnnotateStr = nullptr; if (auto *CE = llvm::dyn_cast<llvm::ConstantExpr>(UI->getOperand(1)); @@ -142,7 +143,8 @@ getLocalSizeArgumentFromAnnotation(llvm::Function &F) { if (auto *Data = llvm::dyn_cast<llvm::ConstantDataArray>(AnnotateStr->getInitializer())) { if (Data->isString() && - Data->getAsString().startswith("hipsycl_nd_kernel_local_size_arg")) { + hipsycl::llvmutils::starts_with(Data->getAsString(), + "hipsycl_nd_kernel_local_size_arg")) { if (auto *BC = llvm::dyn_cast<llvm::BitCastInst>(UI->getOperand(0))) return {BC->getOperand(0), UI}; return {UI->getOperand(0), UI}; @@ -350,6 +352,13 @@ void createLoopsAround(llvm::Function &F, llvm::BasicBlock *AfterBB, VMap[mergeGVLoadsInEntry(F, LocalIdGlobalNamesRotated[0])] = IndVars[0]; + // in case code references all dimensions, we need to set the remaining dimensions to 0 + for (size_t D = Dim; D < 3; ++D) { + auto ID = mergeGVLoadsInEntry(F, LocalIdGlobalNames[D]); + ID->replaceAllUsesWith(Builder.getIntN(Idx->getType()->getIntegerBitWidth(), 0)); + ID->eraseFromParent(); + } + VMap[ContiguousIdx] = Idx; ContiguousIdx = Idx; } @@ -1310,7 +1319,6 @@ void createLoopsAroundKernel(llvm::Function &F, llvm::DominatorTree &DT, llvm::L Body = Body->getSingleSuccessor(); - - llvm::SmallVector<llvm::BasicBlock *> ExitBBs; llvm::BasicBlock *ExitBB = llvm::BasicBlock::Create(F.getContext(), "exit", &F); llvm::IRBuilder<> Bld{ExitBB}; @@ -1351,10 +1359,13 @@ void createLoopsAroundKernel(llvm::Function &F, llvm::DominatorTree &DT, llvm::L llvm::remapInstructionsInBlocks(Blocks, VMap); // remove uses of the undefined global id variables - for (int D = 0; D < Dim; ++D) + for (int D = 0; D < 3; ++D) if (auto *Load = - llvm::cast_or_null<llvm::LoadInst>(getLoadForGlobalVariable(F, LocalIdGlobalNames[D]))) + llvm::cast_or_null<llvm::LoadInst>(mergeGVLoadsInEntry(F, LocalIdGlobalNames[D]))) { + if (D >= Dim) + Load->replaceAllUsesWith(llvm::ConstantInt::get(Load->getType(), 0)); Load->eraseFromParent(); + }
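Both SubCfgFormation hunks above apply the same idea: a load of a local-id global variable for a dimension the kernel does not actually use can be folded to the constant 0 and erased. A minimal standalone sketch of that folding step, assuming an llvm::Module M, a kernel F, an array GlobalNames holding the id globals' names, and the kernel dimensionality Dim (all illustrative stand-ins, not the exact AdaptiveCpp symbols):

  // Illustrative sketch: fold reads of unused work-item dimensions to 0.
  for (int D = Dim; D < 3; ++D) {
    if (auto *GV = M.getGlobalVariable(GlobalNames[D])) {
      // Early-increment iteration so users can be erased while iterating.
      for (llvm::User *U : llvm::make_early_inc_range(GV->users())) {
        if (auto *LI = llvm::dyn_cast<llvm::LoadInst>(U); LI && LI->getFunction() == &F) {
          LI->replaceAllUsesWith(llvm::ConstantInt::get(LI->getType(), 0));
          LI->eraseFromParent();
        }
      }
    }
  }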
HIPSYCL_DEBUG_EXECUTE_VERBOSE(F.viewCFG()) } diff --git a/src/compiler/cbs/VectorizationInfo.cpp b/src/compiler/cbs/VectorizationInfo.cpp index 72c439776..6c9040cc4 100644 --- a/src/compiler/cbs/VectorizationInfo.cpp +++ b/src/compiler/cbs/VectorizationInfo.cpp @@ -17,6 +17,7 @@ #include #include #include +#include using namespace llvm; diff --git a/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp b/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp index bd8873c3e..16e163fd4 100644 --- a/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp +++ b/src/compiler/llvm-to-backend/AddressSpaceInferencePass.cpp @@ -30,6 +30,7 @@ #include "hipSYCL/common/debug.hpp" #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceInferencePass.hpp" #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceMap.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" namespace hipsycl { namespace compiler { @@ -132,10 +133,10 @@ llvm::PreservedAnalyses AddressSpaceInferencePass::run(llvm::Module &M, forEachUseOfPointerValue(AI, [&](llvm::Value* U){ if(auto* CB = llvm::dyn_cast<llvm::CallBase>(U)) { llvm::StringRef CalleeName = CB->getCalledFunction()->getName(); - if(CalleeName.startswith("llvm.lifetime")) { + if(llvmutils::starts_with(CalleeName,"llvm.lifetime")) { InstsToRemove.push_back(CB); - llvm::Intrinsic::ID Id = CalleeName.startswith("llvm.lifetime.start") + llvm::Intrinsic::ID Id = llvmutils::starts_with(CalleeName, "llvm.lifetime.start") ? llvm::Intrinsic::lifetime_start : llvm::Intrinsic::lifetime_end; diff --git a/src/compiler/llvm-to-backend/CMakeLists.txt b/src/compiler/llvm-to-backend/CMakeLists.txt index 9ccebee24..741c2ef55 100644 --- a/src/compiler/llvm-to-backend/CMakeLists.txt +++ b/src/compiler/llvm-to-backend/CMakeLists.txt @@ -26,9 +26,12 @@ function(create_llvm_based_library) target_include_directories(${target} PRIVATE ${LLVM_TO_BACKEND_INCLUDE_DIRS}) target_include_directories(${target} SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) - - target_compile_definitions(${target} PRIVATE ${LLVM_DEFINITIONS} -DHIPSYCL_COMPILER_COMPONENT) - find_library(LLVM_LIBRARY NAMES LLVM HINTS ${LLVM_LIBRARY_DIRS} NO_DEFAULT_PATH) + + separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) + target_compile_definitions(${target} PRIVATE + ${LLVM_DEFINITIONS_LIST}) + target_compile_definitions(${target} PRIVATE -DHIPSYCL_COMPILER_COMPONENT) + find_library(LLVM_LIBRARY NAMES LLVM LLVM-${LLVM_VERSION_MAJOR} HINTS ${LLVM_LIBRARY_DIRS} NO_DEFAULT_PATH) if(NOT LLVM_LIBRARY) message(FATAL_ERROR "LLVM at ${LLVM_DIR} does not have libLLVM.so.
Please disable SSCP and related backends (-DWITH_SSCP_COMPILER=OFF -DWITH_OPENCL_BACKEND=OFF -DWITH_LEVEL_ZERO_BACKEND=OFF) or choose another LLVM installation") endif() @@ -76,8 +79,11 @@ function(create_llvm_to_backend_tool) target_include_directories(${target}-tool PRIVATE ${LLVM_TO_BACKEND_INCLUDE_DIRS}) target_include_directories(${target}-tool SYSTEM PRIVATE ${LLVM_INCLUDE_DIRS}) - - target_compile_definitions(${target}-tool PRIVATE ${LLVM_DEFINITIONS} -DHIPSYCL_TOOL_COMPONENT) + + separate_arguments(LLVM_DEFINITIONS_LIST NATIVE_COMMAND ${LLVM_DEFINITIONS}) + target_compile_definitions(${target}-tool PRIVATE + ${LLVM_DEFINITIONS_LIST}) + target_compile_definitions(${target}-tool PRIVATE -DHIPSYCL_TOOL_COMPONENT) target_link_libraries(${target}-tool PRIVATE ${target}) install(TARGETS ${target}-tool DESTINATION lib/hipSYCL/llvm-to-backend) @@ -111,9 +117,11 @@ if(WITH_SSCP_COMPILER) LLVMToBackend.cpp AddressSpaceInferencePass.cpp KnownGroupSizeOptPass.cpp + KnownPtrParamAlignmentOptPass.cpp GlobalSizesFitInI32OptPass.cpp GlobalInliningAttributorPass.cpp DeadArgumentEliminationPass.cpp + ProcessS2ReflectionPass.cpp ../sscp/KernelOutliningPass.cpp) if(WITH_LLVM_TO_SPIRV) diff --git a/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp b/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp index d8362fd62..68d8b8ea4 100644 --- a/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp +++ b/src/compiler/llvm-to-backend/DeadArgumentEliminationPass.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace hipsycl { diff --git a/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp b/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp index 52671d980..22f6b57fe 100644 --- a/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp +++ b/src/compiler/llvm-to-backend/GlobalInliningAttributorPass.cpp @@ -10,6 +10,8 @@ // SPDX-License-Identifier: BSD-2-Clause #include "hipSYCL/compiler/llvm-to-backend/GlobalInliningAttributorPass.hpp" +#include + namespace hipsycl { namespace compiler { diff --git a/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp b/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp index 0faf0fceb..dffdaf95a 100644 --- a/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp +++ b/src/compiler/llvm-to-backend/GlobalSizesFitInI32OptPass.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace hipsycl { namespace compiler { @@ -124,4 +125,4 @@ llvm::PreservedAnalyses GlobalSizesFitInI32OptPass::run(llvm::Module &M, return llvm::PreservedAnalyses::none(); } } -} \ No newline at end of file +} diff --git a/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp b/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp index e8fa62887..888415f3d 100644 --- a/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp +++ b/src/compiler/llvm-to-backend/KnownGroupSizeOptPass.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace hipsycl { @@ -93,4 +94,4 @@ llvm::PreservedAnalyses KnownGroupSizeOptPass::run(llvm::Module &M, } -} \ No newline at end of file +} diff --git a/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp b/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp new file mode 100644 index 000000000..43b3ba05d --- /dev/null +++ b/src/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.cpp @@ -0,0 +1,63 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
+ * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace hipsycl { +namespace compiler { + +KnownPtrParamAlignmentOptPass::KnownPtrParamAlignmentOptPass( + const std::unordered_map<std::string, std::vector<std::pair<int, int>>> &KnownAlignments) + : KnownPtrParamAlignments{KnownAlignments} {} + +llvm::PreservedAnalyses KnownPtrParamAlignmentOptPass::run(llvm::Module &M, + llvm::ModuleAnalysisManager &MAM) { + llvm::Function *AssumeFunc = llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::assume); + + for(auto& Entry : KnownPtrParamAlignments) { + if(auto* F = M.getFunction(Entry.first)) { + int NumParams = F->getFunctionType()->getNumParams(); + + if(!F->isDeclaration()) { + for(auto& AlignmentInfo : Entry.second) { + int ParamIndex = AlignmentInfo.first; + if(ParamIndex < NumParams) { + llvm::Value* PtrValue = F->getArg(ParamIndex); + llvm::Constant *True = llvm::ConstantInt::get(M.getContext(), llvm::APInt(1, 1)); + llvm::OperandBundleDef AlignBundle{ + "align", std::vector<llvm::Value *>{ + PtrValue, llvm::ConstantInt::get( + M.getContext(), llvm::APInt(64, AlignmentInfo.second))}}; + + llvm::Instruction *InsertionPoint = &(*F->getEntryBlock().getFirstInsertionPt()); + llvm::CallInst::Create( + llvm::FunctionCallee{AssumeFunc}, llvm::ArrayRef<llvm::Value *>{True}, + llvm::ArrayRef<llvm::OperandBundleDef>{AlignBundle}, "", InsertionPoint); + } + } + } + } + } + + return llvm::PreservedAnalyses::none(); +} + + +} +} + diff --git a/src/compiler/llvm-to-backend/LLVMToBackend.cpp b/src/compiler/llvm-to-backend/LLVMToBackend.cpp index 1731eba24..3d1cd123f 100644 --- a/src/compiler/llvm-to-backend/LLVMToBackend.cpp +++ b/src/compiler/llvm-to-backend/LLVMToBackend.cpp @@ -15,11 +15,15 @@ #include "hipSYCL/compiler/llvm-to-backend/GlobalInliningAttributorPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/KnownGroupSizeOptPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp" +#include "hipSYCL/compiler/llvm-to-backend/KnownPtrParamAlignmentOptPass.hpp" +#include "hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp" #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" #include "hipSYCL/compiler/sscp/KernelOutliningPass.hpp" #include "hipSYCL/compiler/utils/ProcessFunctionAnnotationsPass.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" +#include "hipSYCL/sycl/access.hpp" #include @@ -37,15 +41,97 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include namespace hipsycl { namespace compiler { namespace { +template <class T> +std::optional<T> getEnvironmentVariable(const std::string& Name) { + std::string EnvName = Name; + std::transform(EnvName.begin(), EnvName.end(), EnvName.begin(), ::toupper); + + if(const char* EnvVal = std::getenv(("ACPP_S2_"+EnvName).c_str())) { + T val; + std::stringstream sstr{std::string{EnvVal}}; + sstr >> val; + if (!sstr.fail() && !sstr.bad()) { + return val; + } + } + return {}; +} + +template <class T> +T getEnvironmentVariableOrDefault(const std::string& Name, + const T& Default) { + std::optional<T> v = getEnvironmentVariable<T>(Name); + if(v.has_value()) { + return v.value(); + } + return Default;
+}
+
+void printModuleToFile(llvm::Module& M, const std::string& File,
+                       const std::string& Header){
+
+  // Desired behavior is to truncate files for each application run,
+  // but append content in the dump file within one application run.
+  static std::unordered_set<std::string> UsedFiles;
+  auto OpenFlag = llvm::sys::fs::OpenFlags::OF_Append;
+  if(UsedFiles.find(File) == UsedFiles.end()) {
+    OpenFlag = llvm::sys::fs::OpenFlags::OF_None;
+    UsedFiles.insert(File);
+  }
+
+  std::error_code EC;
+  llvm::raw_fd_ostream Out{File, EC, OpenFlag};
+  Out << ";---------------- Begin AdaptiveCpp IR dump --------------\n";
+  Out << Header;
+  M.print(Out, nullptr);
+  Out << ";----------------- End AdaptiveCpp IR dump ---------------\n";
+}
+
+void enableModuleStateDumping(llvm::Module &M, const std::string &PipelineStage,
+                              const std::string &Kernels) {
+  std::string Filter =
+      getEnvironmentVariableOrDefault<std::string>("DUMP_IR_FILTER", "");
+
+  std::string FallbackFileName = M.getSourceFileName()+".ll";
+  std::string FileName =
+      getEnvironmentVariableOrDefault<std::string>("DUMP_IR_" + PipelineStage, "");
+
+  if(FileName == "1")
+    FileName = FallbackFileName;
+
+  std::string Header = "; AdaptiveCpp SSCP S2 IR dump; Compiling kernels: " + Kernels +
+                       ", stage: " + PipelineStage + "\n";
+
+  if(FileName.length() != 0) {
+    if(Kernels == Filter || Filter.empty())
+      printModuleToFile(M, FileName, Header);
+  }
+
+  std::string AllFileName =
+      getEnvironmentVariableOrDefault<std::string>("DUMP_IR_ALL", "");
+  if(AllFileName == "1")
+    AllFileName = FallbackFileName;
+
+  if(AllFileName.length() != 0 && AllFileName != FileName) {
+    if(Kernels == Filter || Filter.empty())
+      printModuleToFile(M, AllFileName, Header);
+  }
+}
+
 bool linkBitcode(llvm::Module &M, std::unique_ptr<llvm::Module> OtherM,
                  const std::string &ForcedTriple = "",
                  const std::string &ForcedDataLayout = "",
@@ -65,7 +151,7 @@ bool linkBitcode(llvm::Module &M, std::unique_ptr<llvm::Module> OtherM,
 void setFastMathFunctionAttribs(llvm::Module& M) {
   auto forceAttr = [&](llvm::Function& F, llvm::StringRef Key, llvm::StringRef Value) {
     if(F.hasFnAttribute(Key)) {
-      if(!F.getFnAttribute(Key).getValueAsString().equals(Value))
+      if(F.getFnAttribute(Key).getValueAsString() != Value)
         F.removeFnAttr(Key);
     }
     F.addFnAttr(Key, Value);
@@ -99,8 +185,8 @@ class InstructionCleanupPass : public llvm::PassInfoMixin<InstructionCleanupPass
-          if (CB->getCalledFunction()->getName().startswith("llvm.stacksave") ||
-              CB->getCalledFunction()->getName().startswith("llvm.stackrestore"))
+          if (llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.stacksave") ||
+              llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.stackrestore"))
             CallsToRemove.push_back(CB);
         }
       }
@@ -217,52 +303,93 @@ bool LLVMToBackendTranslator::fullTransformation(const std::string &LLVMIR, std:
 }
 
 bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) {
-  HIPSYCL_DEBUG_INFO << "LLVMToBackend: Preparing backend flavoring...\n";
+  enableModuleStateDumping(M, "input", getCompilationIdentifier());
 
-  if(!this->prepareBackendFlavor(M))
-    return false;
+  HIPSYCL_DEBUG_INFO << "LLVMToBackend: Preparing backend flavoring...\n";
 
-  // We need to resolve symbols now instead of after optimization, because we
-  // may have to reuotline if the code that is linked in after symbol resolution
-  // depends on IR constants.
-  // This also means that we cannot error yet if we cannot resolve all symbols :(
-  resolveExternalSymbols(M);
+  return withPassBuilderAndMAM([&](llvm::PassBuilder &PB, llvm::ModuleAnalysisManager &MAM) {
+    PassHandler PH {&PB, &MAM};
 
-  HIPSYCL_DEBUG_INFO << "LLVMToBackend: Applying specializations and S2 IR constants...\n";
-  for(auto& A : SpecializationApplicators) {
-    HIPSYCL_DEBUG_INFO << "LLVMToBackend: Processing specialization " << A.first << "\n";
-    A.second(M);
-  }
-  // Return error in case applying specializations has caused error list to be populated
-  if(!Errors.empty())
-    return false;
+    // Do an initial outlining to simplify the code, particularly to reduce
+    // linking complexity if --acpp-export-all is used
+    HIPSYCL_DEBUG_INFO << "LLVMToBackend: Reoutlining kernels...\n";
+    // Function call specializations are only handled at a later stage,
+    // so if the user has requested any, ensure that we don't throw them away
+    // since these functions will not yet appear in the call graph.
+    std::vector<std::string> InitialOutliningEntrypoints = OutliningEntrypoints;
+    for(const auto& FName : FunctionCallSpecializationOutliningEntrypoints)
+      InitialOutliningEntrypoints.push_back(FName);
+    KernelOutliningPass InitialOutlining{InitialOutliningEntrypoints};
+    InitialOutlining.run(M, MAM);
+    enableModuleStateDumping(M, "initial_outlining", getCompilationIdentifier());
 
+    // We need to resolve symbols now instead of after optimization, because we
+    // may have to reoutline if the code that is linked in after symbol resolution
+    // depends on IR constants.
+    // This also means that we cannot error yet if we cannot resolve all symbols :(
+    resolveExternalSymbols(M);
+
+    if(!this->prepareBackendFlavor(M))
+      return false;
+
+    HIPSYCL_DEBUG_INFO << "LLVMToBackend: Applying specializations and S2 IR constants...\n";
+    for(auto& A : SpecializationApplicators) {
+      HIPSYCL_DEBUG_INFO << "LLVMToBackend: Processing specialization " << A.first << "\n";
+      A.second(M);
+    }
+    // Return error in case applying specializations has caused error list to be populated
+    if(!Errors.empty())
+      return false;
+
+    enableModuleStateDumping(M, "specialization", getCompilationIdentifier());
 
-  bool ContainsUnsetIRConstants = false;
-  bool FlavoringSuccessful = false;
-  bool OptimizationSuccessful = false;
+    // Process stage 2 reflection calls
+    ReflectionFields["compiler_backend"] = this->getBackendId();
+    for(const auto& Fields : ReflectionFields) {
+      HIPSYCL_DEBUG_INFO << "LLVMToBackend: Setting up reflection fields: " << Fields.first << " = "
+                         << Fields.second << "\n";
+    }
+    ProcessS2ReflectionPass S2RP{ReflectionFields};
+    S2RP.run(M, MAM);
 
-  constructPassBuilderAndMAM([&](llvm::PassBuilder &PB, llvm::ModuleAnalysisManager &MAM) {
-    PassHandler PH {&PB, &MAM};
+    enableModuleStateDumping(M, "reflection", getCompilationIdentifier());
 
     // Optimize away unnecessary branches due to backend-specific S2IR constants
     // This is what allows us to specialize code for different backends.
     HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing branches post S2 IR constant application...\n";
     IRConstant::optimizeCodeAfterConstantModification(M, MAM);
+
     // Rerun kernel outlining pass so that we don't include unneeded functions
     // that are specific to other backends.
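+    // (after the constant-branch folding above, functions used only by other
+    //  backends' code paths are unreferenced and can therefore be dropped here)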
HIPSYCL_DEBUG_INFO << "LLVMToBackend: Reoutlining kernels...\n"; KernelOutliningPass KP{OutliningEntrypoints}; KP.run(M, MAM); + for(auto& P : NoAliasParameters) { + auto* F = M.getFunction(P.first); + if(F) { + for(int i : P.second) { + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Attaching noalias attribute to parameter " << i + << " of kernel " << P.first << "\n"; + if(i < F->getFunctionType()->getNumParams()) + if(!F->hasParamAttribute(i, llvm::Attribute::AttrKind::NoAlias)) + F->addParamAttr(i, llvm::Attribute::AttrKind::NoAlias); + } + } + } + // These optimizations should be run before __acpp_sscp_* builtins // are resolved, so before backend bitcode libraries are linked. We thus // run them prior to flavoring. KnownGroupSizeOptPass GroupSizeOptPass{KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ}; GlobalSizesFitInI32OptPass SizesAsIntOptPass{GlobalSizesFitInInt, KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ}; + GroupSizeOptPass.run(M, MAM); SizesAsIntOptPass.run(M, MAM); + KnownPtrParamAlignmentOptPass KnownAlignmentOptPass{KnownPtrParamAlignments}; + KnownAlignmentOptPass.run(M, MAM); + // Before optimizing, make sure everything has internal linkage to // help inlining. All linking should have occured by now, except // for backend builtin libraries like libdevice etc @@ -279,56 +406,68 @@ bool LLVMToBackendTranslator::prepareIR(llvm::Module &M) { InstructionCleanupPass ICP; ICP.run(M, MAM); + enableModuleStateDumping(M, "jit_optimizations", getCompilationIdentifier()); + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Adding backend-specific flavor to IR...\n"; - FlavoringSuccessful = this->toBackendFlavor(M, PH); + if(!this->toBackendFlavor(M, PH)) { + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Flavoring failed\n"; + return false; + } + + enableModuleStateDumping(M, "backend_flavoring", getCompilationIdentifier()); + // Run again to resolve reflection inside builtins + S2RP.run(M, MAM); + enableModuleStateDumping(M, "builtin_reflection", getCompilationIdentifier()); + // Inline again to handle builtin definitions pulled in by backend flavors InliningPass.run(M, MAM); - if(FlavoringSuccessful) { - // Run optimizations - HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing flavored IR...\n"; + // Run optimizations + HIPSYCL_DEBUG_INFO << "LLVMToBackend: Optimizing flavored IR...\n"; - if(IsFastMath) - setFastMathFunctionAttribs(M); + if(IsFastMath) + setFastMathFunctionAttribs(M); - // Remove argument_used hints, which are no longer needed once we enter optimization stage. - // This is primarily needed for dynamic functions. - utils::ProcessFunctionAnnotationPass PFA({"argument_used"}); - PFA.run(M, MAM); + // Remove argument_used hints, which are no longer needed once we enter optimization stage. + // This is primarily needed for dynamic functions. 
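+    // (these hints presumably keep dynamic-function arguments alive up to this
+    //  point; once stripped, the optimizer may treat them as ordinary arguments)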
+ utils::ProcessFunctionAnnotationPass PFA({"argument_used"}); + PFA.run(M, MAM); - MAM.clear(); + MAM.clear(); - OptimizationSuccessful = optimizeFlavoredIR(M, PH); + if(!optimizeFlavoredIR(M, PH)) { + this->registerError("LLVMToBackend: Optimization failed"); + return false; + } - if(!OptimizationSuccessful) { - this->registerError("LLVMToBackend: Optimization failed"); + for(const auto& Entry : FunctionsForDeadArgumentElimination) { + if(auto* F = M.getFunction(Entry.first)) { + if(isKernelAfterFlavoring(*F)) { + runKernelDeadArgumentElimination(M, F, PH, *Entry.second); + } } + } + llvm::AlwaysInlinerPass{}.run(M, MAM); - for(const auto& Entry : FunctionsForDeadArgumentElimination) { - if(auto* F = M.getFunction(Entry.first)) { - if(isKernelAfterFlavoring(*F)) { - runKernelDeadArgumentElimination(M, F, PH, *Entry.second); - } + enableModuleStateDumping(M, "full_optimizations", getCompilationIdentifier()); + + enableModuleStateDumping(M, "final", getCompilationIdentifier()); + + bool ContainsUnsetIRConstants = false; + S2IRConstant::forEachS2IRConstant(M, [&](S2IRConstant C) { + if (C.isValid()) { + if (!C.isInitialized()) { + ContainsUnsetIRConstants = true; + this->registerError("LLVMToBackend: AdaptiveCpp S2IR constant was not set: " + + C.getGlobalVariable()->getName().str()); } } - llvm::AlwaysInlinerPass AIP; - AIP.run(M, MAM); - - S2IRConstant::forEachS2IRConstant(M, [&](S2IRConstant C) { - if (C.isValid()) { - if (!C.isInitialized()) { - ContainsUnsetIRConstants = true; - this->registerError("LLVMToBackend: AdaptiveCpp S2IR constant was not set: " + - C.getGlobalVariable()->getName().str()); - } - } - }); - } else { - HIPSYCL_DEBUG_INFO << "LLVMToBackend: Flavoring failed\n"; - } - }); + }); + if(ContainsUnsetIRConstants) + return false; - return FlavoringSuccessful && OptimizationSuccessful && !ContainsUnsetIRConstants; + return true; + }); } bool LLVMToBackendTranslator::translatePreparedIR(llvm::Module &FlavoredModule, std::string &out) { @@ -342,6 +481,16 @@ bool LLVMToBackendTranslator::optimizeFlavoredIR(llvm::Module& M, PassHandler& P // silence optimization remarks,.. 
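+  // (the callback below forwards only DS_Error diagnostics to stderr and
+  //  drops everything else)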
 M.getContext().setDiagnosticHandlerCallBack(
+#if LLVM_VERSION_MAJOR >= 19
+      [](const llvm::DiagnosticInfo *DI, void *Context) {
+        llvm::DiagnosticPrinterRawOStream DP(llvm::errs());
+        if (DI->getSeverity() == llvm::DS_Error) {
+          llvm::errs() << "LLVMToBackend: Error: ";
+          DI->print(DP);
+          llvm::errs() << "\n";
+        }
+      });
+#else
       [](const llvm::DiagnosticInfo &DI, void *Context) {
         llvm::DiagnosticPrinterRawOStream DP(llvm::errs());
         if (DI.getSeverity() == llvm::DS_Error) {
@@ -350,6 +499,7 @@ bool LLVMToBackendTranslator::optimizeFlavoredIR(llvm::Module& M, PassHandler& P
           llvm::errs() << "\n";
         }
       });
+#endif
 
   llvm::ModulePassManager MPM =
       PH.PassBuilder->buildPerModuleDefaultPipeline(llvm::OptimizationLevel::O3);
@@ -399,13 +549,6 @@ bool LLVMToBackendTranslator::linkBitcodeFile(llvm::Module &M, const std::string
       LinkOnlyNeeded);
 }
 
-void LLVMToBackendTranslator::setS2IRConstant(const std::string &name, const void *ValueBuffer) {
-  SpecializationApplicators[name] = [=](llvm::Module& M){
-    S2IRConstant C = S2IRConstant::getFromConstantName(M, name);
-    C.set(ValueBuffer);
-  };
-}
-
 void LLVMToBackendTranslator::specializeKernelArgument(const std::string &KernelName,
                                                        int ParamIndex, const void *ValueBuffer) {
   std::string Id = KernelName+"__specialized_kernel_argument_"+std::to_string(ParamIndex);
@@ -472,6 +615,11 @@ void LLVMToBackendTranslator::specializeKernelArgument(const std::string &Kernel
 void LLVMToBackendTranslator::specializeFunctionCalls(
     const std::string &FuncName, const std::vector<std::string> &ReplacementCalls,
     bool OverrideOnlyUndefined) {
+
+  for(const auto& FName : ReplacementCalls) {
+    this->FunctionCallSpecializationOutliningEntrypoints.push_back(FName);
+  }
+
   std::string Id = "__specialized_function_call_"+FuncName;
   SpecializationApplicators[Id] = [=](llvm::Module &M) {
     HIPSYCL_DEBUG_INFO << "LLVMToBackend: Specializing function calls to " << FuncName << " to:\n";
@@ -543,6 +691,10 @@ void LLVMToBackendTranslator::specializeFunctionCalls(
   };
 }
 
+void LLVMToBackendTranslator::setNoAliasKernelParam(const std::string &KernelName, int ParamIndex) {
+  NoAliasParameters[KernelName].push_back(ParamIndex);
+}
+
 void LLVMToBackendTranslator::provideExternalSymbolResolver(ExternalSymbolResolver Resolver) {
   this->SymbolResolver = Resolver;
   this->HasExternalSymbolResolver = true;
@@ -654,6 +806,32 @@ void LLVMToBackendTranslator::runKernelDeadArgumentElimination(
                        << "\n";
   }
 }
+
+void LLVMToBackendTranslator::setKnownPtrParamAlignment(const std::string &FunctionName,
+                                                        int ParamIndex, int Alignment) {
+  for (auto &Entry : KnownPtrParamAlignments[FunctionName]) {
+    if (Entry.first == ParamIndex) {
+      Entry.second = Alignment;
+      return;
+    }
+  }
+  KnownPtrParamAlignments[FunctionName].push_back(std::make_pair(ParamIndex, Alignment));
+}
+
+void LLVMToBackendTranslator::setReflectionField(const std::string &str, uint64_t value) {
+  ReflectionFields[str] = value;
+}
+
+std::string LLVMToBackendTranslator::getCompilationIdentifier() const {
+  std::string Result;
+  for(const auto& K : Kernels) {
+    Result += "<" + K + ">";
+  }
+  if(Result.empty())
+    return "<unknown>";
+  return Result;
+}
+
 }
 }
diff --git a/src/compiler/llvm-to-backend/ProcessS2ReflectionPass.cpp b/src/compiler/llvm-to-backend/ProcessS2ReflectionPass.cpp
new file mode 100644
index 000000000..6605eee35
--- /dev/null
+++ b/src/compiler/llvm-to-backend/ProcessS2ReflectionPass.cpp
@@ -0,0 +1,104 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/compiler/llvm-to-backend/ProcessS2ReflectionPass.hpp"
+#include "hipSYCL/compiler/utils/LLVMUtils.hpp"
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+
+#include <algorithm>
+#include <cctype>
+
+namespace hipsycl {
+namespace compiler {
+
+namespace {
+
+void handleReflectionFunction(llvm::Module& M, llvm::Function& F, uint64_t Value) {
+  F.setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
+  auto *ReplacementValue = llvm::ConstantInt::get(
+      M.getContext(), llvm::APInt{F.getReturnType()->getIntegerBitWidth(), Value});
+
+  llvm::SmallVector<llvm::CallBase *> CallsToRemove;
+  for(auto* U : F.users()) {
+    if(auto* CB = llvm::dyn_cast<llvm::CallBase>(U)){
+      CB->replaceNonMetadataUsesWith(ReplacementValue);
+      CallsToRemove.push_back(CB);
+    }
+  }
+  for (auto *C : CallsToRemove) {
+    C->replaceAllUsesWith(llvm::UndefValue::get(C->getType()));
+    C->dropAllReferences();
+    C->eraseFromParent();
+  }
+}
+
+std::string getQueryName(llvm::StringRef FunctionName, const std::string& Prefix) {
+  auto Pos = FunctionName.find(Prefix);
+  if(Pos == std::string::npos)
+    return {};
+
+  return FunctionName.str().substr(Pos+Prefix.length());
+}
+
+}
+
+ProcessS2ReflectionPass::ProcessS2ReflectionPass(
+    const std::unordered_map<std::string, uint64_t> &Fields) {
+
+  for(const auto& KV : Fields) {
+    std::string CanonicalizedKey = KV.first;
+
+    std::transform(CanonicalizedKey.begin(), CanonicalizedKey.end(), CanonicalizedKey.begin(),
+                   [](unsigned char c) { return std::tolower(c); });
+
+    for(auto& c : CanonicalizedKey)
+      if(!std::isalnum(c) && c != '_')
+        c='_';
+
+    SupportedFields[CanonicalizedKey] = KV.second;
+  }
+}
+
+llvm::PreservedAnalyses ProcessS2ReflectionPass::run(llvm::Module& M, llvm::ModuleAnalysisManager& MAM) {
+
+  auto processReflectionCalls = [&](const std::string &QueryPrefix,
+                                    const std::string &KnowsQueryPrefix) {
+    for(auto& F : M) {
+      // Note: The order of the if/else branch here assumes that
+      // QueryPrefix is a substring of KnowsQueryPrefix!
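+      // (otherwise a "..._reflect_knows_foo" function would also match the plain
+      //  query prefix and be misread as a query named "knows_foo")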
+ if(llvmutils::starts_with(F.getName(), KnowsQueryPrefix)) { + auto QueryName = getQueryName(F.getName(), KnowsQueryPrefix); + auto It = SupportedFields.find(QueryName); + if(It != SupportedFields.end()) + handleReflectionFunction(M, F, 1); + else + handleReflectionFunction(M, F, 0); + } else if(llvmutils::starts_with(F.getName(), QueryPrefix)) { + auto QueryName = getQueryName(F.getName(), QueryPrefix); + auto It = SupportedFields.find(QueryName); + if(It != SupportedFields.end()) + handleReflectionFunction(M, F, It->second); + } + } + }; + + processReflectionCalls("__acpp_sscp_jit_reflect_", "__acpp_sscp_jit_reflect_knows_"); + processReflectionCalls("__acpp_sscp_s2_reflect_", "__acpp_sscp_s2_reflect_knows_"); + + return llvm::PreservedAnalyses::none(); +} + +} +} \ No newline at end of file diff --git a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp index 6be898ae6..54d5a0c4b 100644 --- a/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp +++ b/src/compiler/llvm-to-backend/amdgpu/LLVMToAmdgpu.cpp @@ -12,7 +12,8 @@ #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceInferencePass.hpp" #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp" #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp" -#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp" +#include "hipSYCL/compiler/utils/LLVMUtils.hpp" +#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp" #include "hipSYCL/common/filesystem.hpp" #include "hipSYCL/common/debug.hpp" #include @@ -153,7 +154,7 @@ class RocmDeviceLibs { Invocation.push_back("-fno-hip-fp32-correctly-rounded-divide-sqrt"); } - if(!llvm::StringRef{ClangPath}.endswith("hipcc")) { + if(!llvmutils::ends_with(llvm::StringRef{ClangPath}, "hipcc")) { // Normally we try to use hipcc. However, when that fails, // we may have fallen back to clang. In that case we may // have to additionally set --rocm-path and --rocm-device-lib-path. 
@@ -208,16 +209,20 @@
 };
 
 LLVMToAmdgpuTranslator::LLVMToAmdgpuTranslator(const std::vector<std::string> &KN)
-    : LLVMToBackendTranslator{sycl::jit::backend::amdgpu, KN, KN}, KernelNames{KN} {
+    : LLVMToBackendTranslator{static_cast<int>(sycl::AdaptiveCpp_jit::compiler_backend::amdgpu), KN, KN},
+      KernelNames{KN} {
   RocmDeviceLibsPath = common::filesystem::join_path(RocmPath, std::vector<std::string>{"amdgcn", "bitcode"});
 }
-
 bool LLVMToAmdgpuTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) {
   M.setTargetTriple(TargetTriple);
-#if LLVM_VERSION_MAJOR >= 17
+#if LLVM_VERSION_MAJOR >= 18
+  M.setDataLayout("e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:"
+                  "32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:"
+                  "256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9");
+#elif LLVM_VERSION_MAJOR >= 17
   M.setDataLayout(
       "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-"
       "i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-"
diff --git a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp
index 932702208..deb97b477 100644
--- a/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp
+++ b/src/compiler/llvm-to-backend/host/HostKernelWrapperPass.cpp
@@ -37,6 +37,9 @@
 namespace hipsycl {
 namespace compiler {
 namespace {
+
+constexpr llvm::StringRef PassPrefix = "[SSCP][HostKernelWrapper] ";
+
 llvm::StoreInst *storeToGlobalVar(llvm::IRBuilderBase Bld, llvm::Value *V,
                                   llvm::StringRef GlobalVarName) {
   auto M = Bld.GetInsertBlock()->getModule();
@@ -46,21 +49,7 @@ llvm::StoreInst *storeToGlobalVar(llvm::IRBuilderBase Bld, llvm::Value *V,
 }
 
 void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm::Value *To) {
-  auto M = F.getParent();
-  auto GV = M->getGlobalVariable(GlobalVarName);
-  if (!GV)
-    return;
-
-  HIPSYCL_DEBUG_INFO << "[SSCP][HostKernelWrapper] RUOGVW: " << *GV << " with " << *To << "\n";
-  llvm::SmallVector<llvm::Instruction *> ToErase;
-  for (auto U : GV->users()) {
-    if (auto I = llvm::dyn_cast<llvm::Instruction>(U); I && I->getFunction() == &F) {
-      HIPSYCL_DEBUG_INFO << "[SSCP][HostKernelWrapper] RUOGVW: " << *I << " with " << *To << "\n";
-      I->replaceAllUsesWith(To);
-    }
-  }
-  for (auto I : ToErase)
-    I->eraseFromParent();
+  utils::replaceUsesOfGVWith(F, GlobalVarName, To, PassPrefix);
 }
 
 /*
@@ -74,7 +63,8 @@ void replaceUsesOfGVWith(llvm::Function &F, llvm::StringRef GlobalVarName, llvm:
  * This makes calling the kernel from the host code straightforward, as only the work group info
  * struct and the user arguments need to be passed to the wrapper.
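  * (Per the GEPs emitted below, the work group info struct holds the number of
  * groups, the group id and the local size, each as size_t[3], followed by the
  * dynamic and the internal local memory pointers.)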
 */
-llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocalMemSize) {
+llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocalMemSize,
+                                    const std::array<int, 3> &KnownWgSize) {
   auto M = F.getParent();
   auto &Ctx = M->getContext();
@@ -85,7 +75,8 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal
       llvm::StructType::get(llvm::ArrayType::get(SizeT, 3),                // # groups
                             llvm::ArrayType::get(SizeT, 3),                // group id
                             llvm::ArrayType::get(SizeT, 3),                // local size
-                            llvm::PointerType::getUnqual(Bld.getInt8Ty())); // local memory size
+                            llvm::PointerType::getUnqual(Bld.getInt8Ty()),  // local memory ptr
+                            llvm::PointerType::getUnqual(Bld.getInt8Ty())); // internal local memory ptr
 
   auto VoidPtrT = llvm::PointerType::getUnqual(Bld.getInt8Ty());
   auto UserArgsT = llvm::PointerType::getUnqual(VoidPtrT);
@@ -135,6 +126,11 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal
       llvm::LLVMContext::MD_dereferenceable,
       llvm::MDNode::get(Ctx, {llvm::ConstantAsMetadata::get(Bld.getInt64(DynamicLocalMemSize))}));
 
+  auto InternalLocalMemPtr = Bld.CreateLoad(
+      VoidPtrT,
+      Bld.CreateInBoundsGEP(WorkGroupInfoT, Wrapper->getArg(0), {Bld.getInt64(0), Bld.getInt32(4)}),
+      "internal_local_mem_ptr");
+
   llvm::SmallVector<llvm::Value *> Args;
 
   auto ArgArray = Wrapper->arg_begin() + 1;
@@ -165,9 +161,16 @@ llvm::Function *makeWrapperFunction(llvm::Function &F, std::int64_t DynamicLocal
   for (int I = 0; I < 3; ++I) {
     replaceUsesOfGVWith(*Wrapper, cbs::NumGroupsGlobalNames[I], NumGroups[I]);
     replaceUsesOfGVWith(*Wrapper, cbs::GroupIdGlobalNames[I], GroupIds[I]);
-    replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I], LocalSize[I]);
+    if (KnownWgSize[I] != 0) {
+      replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I],
+                          llvm::ConstantInt::get(SizeT, KnownWgSize[I]));
+    } else {
+      replaceUsesOfGVWith(*Wrapper, cbs::LocalSizeGlobalNames[I], LocalSize[I]);
+    }
   }
 
+  replaceUsesOfGVWith(*Wrapper, cbs::SscpDynamicLocalMemoryPtrName, LocalMemPtr);
+  replaceUsesOfGVWith(*Wrapper, cbs::SscpInternalLocalMemoryPtrName, InternalLocalMemPtr);
 
   F.setLinkage(llvm::GlobalValue::LinkageTypes::InternalLinkage);
   F.replaceAllUsesWith(Wrapper);
@@ -186,10 +189,9 @@ llvm::PreservedAnalyses HostKernelWrapperPass::run(llvm::Function &F,
   if (!SAA || !SAA->isKernelFunc(&F))
     return llvm::PreservedAnalyses::all();
 
-  auto Wrapper = makeWrapperFunction(F, DynamicLocalMemSize);
+  auto Wrapper = makeWrapperFunction(F, DynamicLocalMemSize, KnownWgSize);
 
-  HIPSYCL_DEBUG_INFO << "[SSCP][HostKernelWrapper] Created kernel wrapper: " << Wrapper->getName()
-                     << "\n";
+  HIPSYCL_DEBUG_INFO << PassPrefix << "Created kernel wrapper: " << Wrapper->getName() << "\n";
 
   return llvm::PreservedAnalyses::none();
 }
diff --git a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp
index 8b776d9dc..0a9dae7f2 100644
--- a/src/compiler/llvm-to-backend/host/LLVMToHost.cpp
+++ b/src/compiler/llvm-to-backend/host/LLVMToHost.cpp
@@ -22,7 +22,7 @@
 #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp"
 #include "hipSYCL/compiler/llvm-to-backend/host/HostKernelWrapperPass.hpp"
 #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp"
-#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp"
 #include
 #include
@@ -59,7 +59,8 @@
 namespace hipsycl {
 namespace compiler {
 
 LLVMToHostTranslator::LLVMToHostTranslator(const std::vector<std::string> &KN)
-    : LLVMToBackendTranslator{sycl::jit::backend::host, KN, KN},
-      KernelNames{KN} {}
+    : LLVMToBackendTranslator{static_cast<int>(sycl::AdaptiveCpp_jit::compiler_backend::host), KN, KN},
+      KernelNames{KN} {}
 
 bool LLVMToHostTranslator::toBackendFlavor(llvm::Module &M, PassHandler &PH) {
@@ -100,7 +101,7 @@ bool LLVMToHostTranslator::toBackendFlavor(llvm::Module &M, PassHandler &PH) {
     registerCBSPipeline(MPM, hipsycl::compiler::OptLevel::O3, true);
     llvm::FunctionPassManager FPM;
-    FPM.addPass(HostKernelWrapperPass{KnownLocalMemSize});
+    FPM.addPass(HostKernelWrapperPass{KnownLocalMemSize, KnownGroupSizeX, KnownGroupSizeY, KnownGroupSizeZ});
 
     MPM.addPass(llvm::createModuleToFunctionPassAdaptor(std::move(FPM)));
     MPM.run(M, *PH.ModuleAnalysisManager);
diff --git a/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp b/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp
index aa0ba57a3..74ea4f546 100644
--- a/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp
+++ b/src/compiler/llvm-to-backend/ptx/LLVMToPtx.cpp
@@ -13,7 +13,7 @@
 #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp"
 #include "hipSYCL/compiler/llvm-to-backend/AddressSpaceInferencePass.hpp"
 #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp"
-#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp"
 #include "hipSYCL/common/filesystem.hpp"
 #include "hipSYCL/common/debug.hpp"
 #include
@@ -155,7 +155,8 @@ void replaceBrokenLLVMIntrinsics(llvm::Module& M) {
 }
 
 LLVMToPtxTranslator::LLVMToPtxTranslator(const std::vector<std::string> &KN)
-    : LLVMToBackendTranslator{sycl::jit::backend::ptx, KN, KN}, KernelNames{KN} {}
+    : LLVMToBackendTranslator{static_cast<int>(sycl::AdaptiveCpp_jit::compiler_backend::ptx), KN, KN},
+      KernelNames{KN} {}
 
 bool LLVMToPtxTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) {
   std::string Triple = "nvptx64-nvidia-cuda";
diff --git a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp
index c41c44498..e50040bb7 100644
--- a/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp
+++ b/src/compiler/llvm-to-backend/spirv/LLVMToSpirv.cpp
@@ -14,7 +14,8 @@
 #include "hipSYCL/compiler/llvm-to-backend/LLVMToBackend.hpp"
 #include "hipSYCL/compiler/llvm-to-backend/Utils.hpp"
 #include "hipSYCL/compiler/sscp/IRConstantReplacer.hpp"
-#include "hipSYCL/glue/llvm-sscp/s2_ir_constants.hpp"
+#include "hipSYCL/compiler/utils/LLVMUtils.hpp"
+#include "hipSYCL/glue/llvm-sscp/jit-reflection/queries.hpp"
 #include "hipSYCL/common/filesystem.hpp"
 #include "hipSYCL/common/debug.hpp"
 #include
@@ -122,7 +123,8 @@ void assignSPIRCallConvention(llvm::Function *F) {
 }
 
 LLVMToSpirvTranslator::LLVMToSpirvTranslator(const std::vector<std::string> &KN)
-    : LLVMToBackendTranslator{sycl::jit::backend::spirv, KN, KN}, KernelNames{KN} {}
+    : LLVMToBackendTranslator{static_cast<int>(sycl::AdaptiveCpp_jit::compiler_backend::spirv), KN, KN},
+      KernelNames{KN} {}
 
 bool LLVMToSpirvTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) {
@@ -193,8 +195,8 @@ bool LLVMToSpirvTranslator::toBackendFlavor(llvm::Module &M, PassHandler& PH) {
         // llvm-spirv translator does not like llvm.lifetime.start/end operating on generic
         // pointers.
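         // (hence the code below only needs to special-case lifetime calls whose
         //  pointer operand lives in the generic address space)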
         auto* CalledF = CB->getCalledFunction();
-        if (CalledF->getName().startswith("llvm.lifetime.start") ||
-            CalledF->getName().startswith("llvm.lifetime.end")) {
+        if (llvmutils::starts_with(CalledF->getName(), "llvm.lifetime.start") ||
+            llvmutils::starts_with(CalledF->getName(), "llvm.lifetime.end")) {
           if(CB->getNumOperands() > 1 && CB->getArgOperand(1)->getType()->isPointerTy())
             if (CB->getArgOperand(1)->getType()->getPointerAddressSpace() ==
                 ASMap[AddressSpace::Generic])
diff --git a/src/compiler/sscp/KernelOutliningPass.cpp b/src/compiler/sscp/KernelOutliningPass.cpp
index d62d348db..8cb5d328d 100644
--- a/src/compiler/sscp/KernelOutliningPass.cpp
+++ b/src/compiler/sscp/KernelOutliningPass.cpp
@@ -257,6 +257,9 @@ void canonicalizeKernelParameters(llvm::Function* F, llvm::Module& M) {
 }
 
+EntrypointPreparationPass::EntrypointPreparationPass(bool ExportByDefault)
+: ExportAll{ExportByDefault} {}
+
 llvm::PreservedAnalyses
 EntrypointPreparationPass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
 
@@ -265,6 +268,24 @@ EntrypointPreparationPass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM)
   llvm::SmallSet<std::string, 16> Kernels;
 
+
+  llvm::DenseSet<llvm::Function *> MarkedFunctions;
+  auto MarkThisFunctionForOutlining = [&](llvm::Function* F) {
+    HIPSYCL_DEBUG_INFO << "Found SSCP outlining entrypoint: " << F->getName() << "\n";
+    // Make kernel have external linkage to avoid having everything optimized away
+    F->setLinkage(llvm::GlobalValue::ExternalLinkage);
+
+    // If we have a definition, we need to perform outlining.
+    // Otherwise, we would need to treat the function as imported --
+    // however this cannot really happen as clang does not codegen our
+    // attribute((annotate("hipsycl_sscp_outlining"))) for declarations
+    // without definition.
+    if(F->size() > 0 && !MarkedFunctions.contains(F)) {
+      this->OutliningEntrypoints.push_back(F->getName().str());
+      MarkedFunctions.insert(F);
+    }
+  };
+
   utils::findFunctionsWithStringAnnotationsWithArg(M, [&](llvm::Function* F, llvm::StringRef Annotation, llvm::Constant* Argument){
     if(F) {
       if(Annotation.compare(SscpKernelDimensionName) == 0){
@@ -295,22 +316,20 @@ EntrypointPreparationPass::run(llvm::Module &M, llvm::ModuleAnalysisManager &AM)
         this->KernelNames.push_back(F->getName().str());
         Kernels.insert(F->getName().str());
       }
+
       if(Annotation.compare(SSCPOutliningMarker) == 0) {
-        HIPSYCL_DEBUG_INFO << "Found SSCP outlining entrypoint: " << F->getName() << "\n";
-        // Make kernel have external linkage to avoid having everything optimized away
-        F->setLinkage(llvm::GlobalValue::ExternalLinkage);
-
-        // If we have a definition, we need to perform outlining.
-        // Otherwise, we would need to treat the function as imported --
-        // however this cannot really happen as clang does not codegen our
-        // attribute((annotate("hipsycl_sscp_outlining"))) for declarations
-        // without definition.
-        if(F->size() > 0)
-          this->OutliningEntrypoints.push_back(F->getName().str());
+        MarkThisFunctionForOutlining(F);
       }
     }
   });
 
+  if(ExportAll) {
+    for(auto& F: M) {
+      if (!F.isIntrinsic() && F.getLinkage() != llvm::GlobalValue::LinkageTypes::InternalLinkage)
+        MarkThisFunctionForOutlining(&F);
+    }
+  }
+
   for(const auto& EP : OutliningEntrypoints) {
     if(!Kernels.contains(EP)) {
diff --git a/src/compiler/sscp/StdAtomicRemapperPass.cpp b/src/compiler/sscp/StdAtomicRemapperPass.cpp
index dad04c16a..9f0ab6436 100644
--- a/src/compiler/sscp/StdAtomicRemapperPass.cpp
+++ b/src/compiler/sscp/StdAtomicRemapperPass.cpp
@@ -14,6 +14,7 @@
 #include
 #include
+#include
 #include
 #include
diff --git a/src/compiler/sscp/StdBuiltinRemapperPass.cpp b/src/compiler/sscp/StdBuiltinRemapperPass.cpp
index b36808902..ce633c989 100644
--- a/src/compiler/sscp/StdBuiltinRemapperPass.cpp
+++ b/src/compiler/sscp/StdBuiltinRemapperPass.cpp
@@ -10,9 +10,17 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "hipSYCL/compiler/sscp/StdBuiltinRemapperPass.hpp"
 #include "hipSYCL/common/debug.hpp"
+
+#include
+#include
+
 #include
 #include
 
+#define STRINGIFY2(x) #x
+#define STRINGIFY(x) STRINGIFY2(x)
+
+
 namespace hipsycl {
 namespace compiler {
 
@@ -54,7 +62,34 @@ using builtin_mapping = std::array<const char *, 2>;
 static constexpr std::array explicitly_mapped_builtins = {
     // clang sometimes (e.g. -ffast-math) emits these builtins
     builtin_mapping{"__powisf2", "__acpp_sscp_pown_f32"},
-    builtin_mapping{"__powidf2", "__acpp_sscp_pown_f64"}
+    builtin_mapping{"__powidf2", "__acpp_sscp_pown_f64"},
+
+#define ACPP_DECLARE_FINITE_BUILTIN_MAPPING(name) \
+  builtin_mapping{STRINGIFY(__ ## name ## f_finite), STRINGIFY(__acpp_sscp_ ## name ## _f32)}, \
+  builtin_mapping{STRINGIFY(__ ## name ## _finite), STRINGIFY(__acpp_sscp_ ## name ## _f64)}
+
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(acos),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(acosh),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(asin),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(asinh),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(atan2),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(atanh),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(cosh),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(cos),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(exp10),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(exp2),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(exp),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(fmod),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(log10),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(log2),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(log),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(hypot),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(pow),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(remainder),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(sinh),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(sin),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(sqrt),
+    ACPP_DECLARE_FINITE_BUILTIN_MAPPING(tan)
 };
 
 llvm::PreservedAnalyses StdBuiltinRemapperPass::run(llvm::Module &M,
diff --git a/src/compiler/sscp/TargetSeparationPass.cpp b/src/compiler/sscp/TargetSeparationPass.cpp
index 7bf0522c1..6cd9e9cb3 100644
--- a/src/compiler/sscp/TargetSeparationPass.cpp
+++ b/src/compiler/sscp/TargetSeparationPass.cpp
@@ -19,6 +19,7 @@
 #include "hipSYCL/compiler/CompilationState.hpp"
 #include "hipSYCL/compiler/cbs/IRUtils.hpp"
 #include "hipSYCL/compiler/utils/ProcessFunctionAnnotationsPass.hpp"
+#include "hipSYCL/compiler/utils/LLVMUtils.hpp"
 #include "hipSYCL/common/hcf_container.hpp"
 #include
@@ -98,13 +99,18 @@ class ScopedPrintingTimer : private Timer {
 
 static llvm::cl::opt<bool> SSCPEmitHcf{
     "acpp-sscp-emit-hcf", llvm::cl::init(false),
-    llvm::cl::desc{"Emit HCF from hipSYCL LLVM SSCP compilation flow"}};
+    llvm::cl::desc{"Emit HCF from AdaptiveCpp LLVM SSCP compilation flow"}};
 
 static llvm::cl::opt<bool> PreoptimizeSSCPKernels{
    "acpp-sscp-preoptimize", llvm::cl::init(false),
    llvm::cl::desc{
        "Preoptimize SYCL kernels in LLVM IR instead of embedding unoptimized kernels and relying "
-        "on optimization at runtime. This is mainly for hipSYCL developers and NOT supported!"}};
+        "on optimization at runtime. This is mainly for AdaptiveCpp developers and NOT supported!"}};
+
+static llvm::cl::opt<bool> ExportAllSymbols{
+    "acpp-sscp-export-all", llvm::cl::init(false),
+    llvm::cl::desc{
+        "(experimental) export all functions for JIT-time linking"}};
 
 static const char *SscpIsHostIdentifier = "__acpp_sscp_is_host";
 static const char *SscpIsDeviceIdentifier = "__acpp_sscp_is_device";
@@ -278,7 +284,7 @@ std::unique_ptr<llvm::Module> generateDeviceIR(llvm::Module &M,
     }
   }
 
-  EntrypointPreparationPass EPP;
+  EntrypointPreparationPass EPP{ExportAllSymbols};
   EPP.run(*DeviceModule, DeviceMAM);
 
   ExportedSymbolsOutput = EPP.getNonKernelOutliningEntrypoints();
@@ -317,7 +323,7 @@ std::unique_ptr<llvm::Module> generateDeviceIR(llvm::Module &M,
       // if they are not defined, not an intrinsic and don't start with
       // __ like our hipSYCL builtins. This is a hack, it would
      // be better if we could tell clang to annotate the declaration for us :(
-      if(!F.isIntrinsic() && !F.getName().startswith("__"))
+      if(!F.isIntrinsic() && !llvmutils::starts_with(F.getName(), "__"))
        ImportedSymbolsOutput.push_back(F.getName().str());
     }
   }
diff --git a/src/compiler/stdpar/MallocToUSM.cpp b/src/compiler/stdpar/MallocToUSM.cpp
index e5cb2c47f..d2bae051b 100644
--- a/src/compiler/stdpar/MallocToUSM.cpp
+++ b/src/compiler/stdpar/MallocToUSM.cpp
@@ -10,6 +10,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "hipSYCL/compiler/stdpar/MallocToUSM.hpp"
 #include "hipSYCL/compiler/cbs/IRUtils.hpp"
+#include "hipSYCL/compiler/utils/LLVMUtils.hpp"
 
@@ -71,7 +72,7 @@ bool NameStartsWithItaniumIdentifier(llvm::StringRef Name, llvm::StringRef Ident
 
 bool isRestrictedToRegularMalloc(llvm::Function* F) {
   llvm::StringRef Name = F->getName();
-  if(!Name.startswith("_Z"))
+  if(!llvmutils::starts_with(Name, "_Z"))
     return false;
 
   if(NameStartsWithItaniumIdentifier(Name, "hipsycl"))
@@ -82,7 +83,9 @@ bool isRestrictedToRegularMalloc(llvm::Function* F) {
 bool isStdFunction(llvm::Function* F) {
   llvm::StringRef Name = F->getName();
-  if(Name.startswith("_ZNSt") || Name.startswith("_ZSt") || Name.startswith("_ZNKSt"))
+  if(llvmutils::starts_with(Name, "_ZNSt") ||
+     llvmutils::starts_with(Name, "_ZSt") ||
+     llvmutils::starts_with(Name, "_ZNKSt"))
     return true;
   return false;
 }
diff --git a/src/compiler/stdpar/SyncElision.cpp b/src/compiler/stdpar/SyncElision.cpp
index 506955e1d..1d192d2ca 100644
--- a/src/compiler/stdpar/SyncElision.cpp
+++ b/src/compiler/stdpar/SyncElision.cpp
@@ -10,7 +10,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "hipSYCL/compiler/stdpar/SyncElision.hpp"
 #include "hipSYCL/compiler/cbs/IRUtils.hpp"
-
+#include "hipSYCL/compiler/utils/LLVMUtils.hpp"
 #include
 #include
@@ -82,7 +82,7 @@ void identifyStoresPotentiallyForStdparArgHandling(
       if (StdparFunctions.contains(CB->getCalledFunction())) {
         Users.push_back(Current);
         return true;
-      } else if(CB->getCalledFunction()->getName().startswith("llvm.lifetime")) {
+      } else if(llvmutils::starts_with(CB->getCalledFunction()->getName(), "llvm.lifetime")) {
         return true;
       }
     }
@@ -134,7 +134,7 @@ bool functionDoesNotAccessMemory(llvm::Function* F){
   if(!F)
     return true;
   if(F->isIntrinsic()) {
-    if(F->getName().startswith("llvm.lifetime")){
+    if(llvmutils::starts_with(F->getName(), "llvm.lifetime")){
       return true;
     }
   }
@@ -202,7 +202,7 @@ void forEachReachableInstructionRequiringSync(
   while(Current) {
     if(auto* CB = llvm::dyn_cast<llvm::CallBase>(Current)) {
       llvm::Function* CalledF = CB->getCalledFunction();
-      if(CalledF->getName().equals(BarrierBuiltinName)) {
+      if(CalledF->getName() == BarrierBuiltinName) {
         // basic block already contains barrier; nothing to do
         return;
       }
diff --git a/src/libkernel/sscp/amdgpu/CMakeLists.txt b/src/libkernel/sscp/amdgpu/CMakeLists.txt
index da6a56050..44f34354b 100644
--- a/src/libkernel/sscp/amdgpu/CMakeLists.txt
+++ b/src/libkernel/sscp/amdgpu/CMakeLists.txt
@@ -2,6 +2,23 @@
 if(WITH_LLVM_TO_AMDGPU_AMDHSA)
   libkernel_generate_bitcode_target(
     TARGETNAME amdgpu-amdhsa
     TRIPLE amdgcn-amd-amdhsa
-    SOURCES atomic.cpp barrier.cpp core.cpp half.cpp integer.cpp math.cpp native.cpp print.cpp relational.cpp subgroup.cpp localmem.cpp
+    SOURCES
+      atomic.cpp
+      barrier.cpp
+      core.cpp
+      half.cpp
+      integer.cpp
+      math.cpp
+      native.cpp
+      print.cpp
+      relational.cpp
+      subgroup.cpp
+      localmem.cpp
+      shuffle.cpp
+      collpredicate.cpp
+      reduction.cpp
+      broadcast.cpp
+      scan_inclusive.cpp
+      scan_exclusive.cpp
     ADDITIONAL_ARGS -nogpulib)
 endif()
diff --git a/src/libkernel/sscp/amdgpu/barrier.cpp b/src/libkernel/sscp/amdgpu/barrier.cpp
index 8f22c85a6..68e2a5737 100644
--- a/src/libkernel/sscp/amdgpu/barrier.cpp
+++ b/src/libkernel/sscp/amdgpu/barrier.cpp
@@ -10,62 +10,68 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "hipSYCL/sycl/libkernel/sscp/builtins/barrier.hpp"
 
-enum amdgpu_memory_order {
-  relaxed = __ATOMIC_RELAXED,
-  acquire = __ATOMIC_ACQUIRE,
-  release = __ATOMIC_RELEASE,
-  acq_rel = __ATOMIC_ACQ_REL,
-  seq_cst = __ATOMIC_SEQ_CST
-};
-
-enum amdgpu_memory_scope {
-  work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
-  work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
-  device = __OPENCL_MEMORY_SCOPE_DEVICE,
-  all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
-  sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
-};
-
-
-#define __CLK_LOCAL_MEM_FENCE 0x01
-
-
-extern "C" void
-__atomic_work_item_fence(unsigned mem_fence_flags, amdgpu_memory_order, amdgpu_memory_scope);
-
-__attribute__((always_inline)) amdgpu_memory_order
-__acpp_amdgpu_get_mem_order(__acpp_sscp_memory_order order) {
-  if(order == __acpp_sscp_memory_order::acq_rel)
-    return acq_rel;
-  else if(order == __acpp_sscp_memory_order::acquire)
-    return acquire;
-  else if(order == __acpp_sscp_memory_order::release)
-    return release;
-  else if(order == __acpp_sscp_memory_order::relaxed)
-    return relaxed;
-  else
-    return seq_cst;
-}
-
 __attribute__((always_inline)) void __acpp_amdgpu_local_barrier() {
-  __atomic_work_item_fence(__CLK_LOCAL_MEM_FENCE, release, work_group);
+  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
   __builtin_amdgcn_s_barrier();
-  __atomic_work_item_fence(__CLK_LOCAL_MEM_FENCE, acquire, work_group);
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 }
 
 __attribute__((always_inline)) void
 __acpp_amdgpu_mem_fence(__acpp_sscp_memory_scope fence_scope,
                         __acpp_sscp_memory_order order) {
-  auto mem_order = __acpp_amdgpu_get_mem_order(order);
-
   // (scope strings below: "workgroup" = work-group, "agent" = device, "" = system-wide)
   if(fence_scope == __acpp_sscp_memory_scope::work_group) {
-    __atomic_work_item_fence(0, mem_order, work_group);
+    switch(order) {
+    case __acpp_sscp_memory_order::relaxed:
+      break;
+    case __acpp_sscp_memory_order::acquire:
+      __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
+      break;
+    case __acpp_sscp_memory_order::release:
+
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + break; + case __acpp_sscp_memory_order::acq_rel: + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup"); + break; + case __acpp_sscp_memory_order::seq_cst: + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup"); + break; + } } else if(fence_scope == __acpp_sscp_memory_scope::device) { - __atomic_work_item_fence(0, mem_order, device); + switch(order) { + case __acpp_sscp_memory_order::relaxed: + break; + case __acpp_sscp_memory_order::acquire: + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); + break; + case __acpp_sscp_memory_order::release: + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); + break; + case __acpp_sscp_memory_order::acq_rel: + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent"); + break; + case __acpp_sscp_memory_order::seq_cst: + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent"); + break; + } } else if(fence_scope == __acpp_sscp_memory_scope::system) { - __atomic_work_item_fence(0, mem_order, all_svm_devices); + switch(order) { + case __acpp_sscp_memory_order::relaxed: + break; + case __acpp_sscp_memory_order::acquire: + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, ""); + break; + case __acpp_sscp_memory_order::release: + __builtin_amdgcn_fence(__ATOMIC_RELEASE, ""); + break; + case __acpp_sscp_memory_order::acq_rel: + __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, ""); + break; + case __acpp_sscp_memory_order::seq_cst: + __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, ""); + break; + } } } diff --git a/src/libkernel/sscp/amdgpu/broadcast.cpp b/src/libkernel/sscp/amdgpu/broadcast.cpp new file mode 100644 index 000000000..c465eb57d --- /dev/null +++ b/src/libkernel/sscp/amdgpu/broadcast.cpp @@ -0,0 +1,38 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +#define ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + ACPP_SHMEM_ATTRIBUTE __acpp_##input_type shrd_x[1]; \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, &shrd_x[0]); \ + } + +ACPP_WORKGROUP_BCAST(i8, int8) +ACPP_WORKGROUP_BCAST(i16, int16) +ACPP_WORKGROUP_BCAST(i32, int32) +ACPP_WORKGROUP_BCAST(i64, int64) + +ACPP_SUBGROUP_BCAST(i8, int8) +ACPP_SUBGROUP_BCAST(i16, int16) +ACPP_SUBGROUP_BCAST(i32, int32) +ACPP_SUBGROUP_BCAST(i64, int64) \ No newline at end of file diff --git a/src/libkernel/sscp/amdgpu/collpredicate.cpp b/src/libkernel/sscp/amdgpu/collpredicate.cpp new file mode 100644 index 000000000..102f4e53f --- /dev/null +++ b/src/libkernel/sscp/amdgpu/collpredicate.cpp @@ -0,0 +1,47 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
+ * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + + #include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/amdgpu/ockl.hpp" + + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_any(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_all(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_none(bool pred){ + bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_all(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_any(bool pred){ + return __ockl_wfany_i32(pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_none(bool pred){ + bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} diff --git a/src/libkernel/sscp/amdgpu/reduction.cpp b/src/libkernel/sscp/amdgpu/reduction.cpp new file mode 100644 index 000000000..6ba3e85f9 --- /dev/null +++ b/src/libkernel/sscp/amdgpu/reduction.cpp @@ -0,0 +1,148 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + +#define ACPP_SUBGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_REDUCTION(f16) +ACPP_SUBGROUP_FLOAT_REDUCTION(f32) +ACPP_SUBGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, 
hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) +ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) +ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) +ACPP_WORKGROUP_INT_REDUCTION(u16, uint16) +ACPP_WORKGROUP_INT_REDUCTION(u32, uint32) +ACPP_WORKGROUP_INT_REDUCTION(u64, uint64) diff --git a/src/libkernel/sscp/amdgpu/scan_exclusive.cpp b/src/libkernel/sscp/amdgpu/scan_exclusive.cpp new file mode 100644 index 000000000..ec319c79d --- /dev/null +++ b/src/libkernel/sscp/amdgpu/scan_exclusive.cpp @@ -0,0 +1,162 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return 
hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/amdgpu/scan_inclusive.cpp b/src/libkernel/sscp/amdgpu/scan_inclusive.cpp new file mode 100644 index 000000000..9efa6ddeb --- /dev/null +++ b/src/libkernel/sscp/amdgpu/scan_inclusive.cpp @@ -0,0 +1,151 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
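+ *
+ * Sub-group scans below dispatch on __acpp_sscp_algorithm_op and forward to
+ * sg_inclusive_scan; work-group scans go through wg_hiplike_scan, which stages
+ * per-sub-group partial results in a 32-slot shared-memory array.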
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return 
hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/amdgpu/shuffle.cpp b/src/libkernel/sscp/amdgpu/shuffle.cpp new file mode 100644 index 000000000..cf86a1889 --- /dev/null +++ b/src/libkernel/sscp/amdgpu/shuffle.cpp @@ -0,0 +1,136 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
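+ *
+ * Sub-group shuffles map onto __builtin_amdgcn_ds_bpermute, which exchanges
+ * 32-bit values between lanes (the operand is a byte address, hence the
+ * index << 2 below). The 8- and 16-bit variants widen to the 32-bit builtin;
+ * the 64-bit variants split the value into two 32-bit halves, shuffle each
+ * half, and reassemble the result.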
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/subgroup.hpp"
+
+namespace detail {
+static inline unsigned int __lane_id() {
+  return __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0));
+}
+} // namespace detail
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shl_i32(value, delta);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int16 __acpp_sscp_sub_group_shl_i16(__acpp_int16 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shl_i32(value, delta);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int32 __acpp_sscp_sub_group_shl_i32(__acpp_int32 value, __acpp_uint32 delta) {
+  auto sg_size = __acpp_sscp_get_subgroup_max_size();
+  int self = detail::__lane_id();
+  int index = (self + delta);
+  // Clamp to self when the source lane would fall outside this sub-group; an
+  // in-sub-group offset that reaches sg_size is already out of range, hence >=.
+  index = (int)((self & (sg_size - 1)) + delta) >= sg_size ? self : index;
+
+  return __builtin_amdgcn_ds_bpermute(index << 2, value);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int64 __acpp_sscp_sub_group_shl_i64(__acpp_int64 value, __acpp_uint32 delta) {
+  int tmp[2];
+  __builtin_memcpy(tmp, &value, sizeof(tmp));
+  tmp[0] = __acpp_sscp_sub_group_shl_i32(tmp[0], delta);
+  tmp[1] = __acpp_sscp_sub_group_shl_i32(tmp[1], delta);
+  __acpp_int64 result =
+      (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0]));
+  return result;
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int8 __acpp_sscp_sub_group_shr_i8(__acpp_int8 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shr_i32(value, delta);
+}
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int16 __acpp_sscp_sub_group_shr_i16(__acpp_int16 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shr_i32(value, delta);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int32 __acpp_sscp_sub_group_shr_i32(__acpp_int32 value, __acpp_uint32 delta) {
+  int self = detail::__lane_id();
+  int width = __acpp_sscp_get_subgroup_max_size();
+  int index = self - delta;
+  index = (index < (self & ~(width - 1))) ?
self : index; + return __builtin_amdgcn_ds_bpermute(index << 2, value); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shr_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shr_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shr_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_permute_i8(__acpp_int8 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_permute_i16(__acpp_int16 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_permute_i32(__acpp_int32 value, __acpp_int32 mask) { + int self = detail::__lane_id(); + int index = self ^ mask; + return __builtin_amdgcn_ds_bpermute(index << 2, value); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_permute_i64(__acpp_int64 value, __acpp_int32 mask) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_permute_i32(tmp[0], mask); + tmp[1] = __acpp_sscp_sub_group_permute_i32(tmp[1], mask); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_select_i8(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_select_i16(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_select_i32(__acpp_int32 value, __acpp_int32 id) { + int max_subgroup_size = __acpp_sscp_get_subgroup_max_size(); + int index = id % max_subgroup_size; + return __builtin_amdgcn_ds_bpermute(index << 2, value); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_select_i32(tmp[0], id); + tmp[1] = __acpp_sscp_sub_group_select_i32(tmp[1], id); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} diff --git a/src/libkernel/sscp/host/CMakeLists.txt b/src/libkernel/sscp/host/CMakeLists.txt index 35e7e43ff..dae895689 100644 --- a/src/libkernel/sscp/host/CMakeLists.txt +++ b/src/libkernel/sscp/host/CMakeLists.txt @@ -11,10 +11,16 @@ if(WITH_LLVM_TO_HOST) half.cpp math.cpp native.cpp + shuffle.cpp print.cpp relational.cpp localmem.cpp - subgroup.cpp) + subgroup.cpp + reduction.cpp + broadcast.cpp + scan_inclusive.cpp + scan_exclusive.cpp + collpredicate.cpp) libkernel_generate_bitcode_target( TARGETNAME host diff --git a/src/libkernel/sscp/host/broadcast.cpp b/src/libkernel/sscp/host/broadcast.cpp new file mode 100644 index 000000000..c7df83746 --- /dev/null +++ b/src/libkernel/sscp/host/broadcast.cpp @@ -0,0 +1,40 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
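+ *
+ * On the host backend there is no GPU shared memory; the work-group
+ * broadcast below stages the value through the internal local-memory
+ * scratch area returned by __acpp_sscp_host_get_internal_local_memory().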
+ * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory(); + +#define HOST_ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + __acpp_##input_type *shrd_x = \ + static_cast<__acpp_##input_type *>(__acpp_sscp_host_get_internal_local_memory()); \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, shrd_x); \ + } + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +HOST_ACPP_WORKGROUP_BCAST(i8, int8) +HOST_ACPP_WORKGROUP_BCAST(i16, int16) +HOST_ACPP_WORKGROUP_BCAST(i32, int32) +HOST_ACPP_WORKGROUP_BCAST(i64, int64) + +ACPP_SUBGROUP_BCAST(i8, int8) +ACPP_SUBGROUP_BCAST(i16, int16) +ACPP_SUBGROUP_BCAST(i32, int32) +ACPP_SUBGROUP_BCAST(i64, int64) diff --git a/src/libkernel/sscp/host/collpredicate.cpp b/src/libkernel/sscp/host/collpredicate.cpp new file mode 100644 index 000000000..6138fedb4 --- /dev/null +++ b/src/libkernel/sscp/host/collpredicate.cpp @@ -0,0 +1,48 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
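+ *
+ * The collective predicates below are expressed via the i8 reduction
+ * builtins: any() is a logical_or reduction, all() a logical_and reduction,
+ * and none() negates the logical_or result.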
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#define ACPP_SSCP_OMP_LIBKERNEL + + #include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/amdgpu/ockl.hpp" + + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_any(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_all(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_none(bool pred){ + bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_all(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_any(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_none(bool pred){ + bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} diff --git a/src/libkernel/sscp/host/localmem.cpp b/src/libkernel/sscp/host/localmem.cpp index b2fdadc29..f136b54be 100644 --- a/src/libkernel/sscp/host/localmem.cpp +++ b/src/libkernel/sscp/host/localmem.cpp @@ -11,7 +11,21 @@ #include "hipSYCL/sycl/libkernel/sscp/builtins/localmem.hpp" extern "C" void* __acpp_cbs_sscp_dynamic_local_memory; +extern "C" void* __acpp_cbs_sscp_internal_local_memory; +HIPSYCL_SSCP_BUILTIN __attribute__((address_space(3))) void* __acpp_sscp_get_dynamic_local_memory() { - return (__attribute__((address_space(3))) void*)(__acpp_cbs_sscp_dynamic_local_memory); + + // We rely on the host side allocating page-aligned memory. On all relevant + // systems, the page size is larger than 512 bytes, so using this as a + // conservative minimum alignment seems safe. + return (__attribute__((address_space(3))) void *)(__builtin_assume_aligned( + __acpp_cbs_sscp_dynamic_local_memory, 512)); } + + +HIPSYCL_SSCP_BUILTIN +void* __acpp_sscp_host_get_internal_local_memory() { + return (void *)(__builtin_assume_aligned( + __acpp_cbs_sscp_internal_local_memory, 512)); +} \ No newline at end of file diff --git a/src/libkernel/sscp/host/math.cpp b/src/libkernel/sscp/host/math.cpp index 1881230e8..9ee91df92 100644 --- a/src/libkernel/sscp/host/math.cpp +++ b/src/libkernel/sscp/host/math.cpp @@ -8,7 +8,7 @@ * See file LICENSE in the project root for full license details. 
 */
 // SPDX-License-Identifier: BSD-2-Clause
-#include <math.h>
+#include <cmath>
 
 #include "hipSYCL/sycl/libkernel/sscp/builtins/builtin_config.hpp"
 #include "hipSYCL/sycl/libkernel/sscp/builtins/math.hpp"
diff --git a/src/libkernel/sscp/host/print.cpp b/src/libkernel/sscp/host/print.cpp
index decb8890d..7801916f3 100644
--- a/src/libkernel/sscp/host/print.cpp
+++ b/src/libkernel/sscp/host/print.cpp
@@ -10,7 +10,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "hipSYCL/sycl/libkernel/sscp/builtins/print.hpp"
-#include <stdio.h>
+#include <cstdio>
 
 void __acpp_sscp_print(const char* msg) {
   puts(msg);
diff --git a/src/libkernel/sscp/host/reduction.cpp b/src/libkernel/sscp/host/reduction.cpp
new file mode 100644
index 000000000..daf220a66
--- /dev/null
+++ b/src/libkernel/sscp/host/reduction.cpp
@@ -0,0 +1,152 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp"
+
+HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory();
+
+#define ACPP_SUBGROUP_FLOAT_REDUCTION(type)                                      \
+  HIPSYCL_SSCP_CONVERGENT_BUILTIN                                                \
+  __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \
+                                                    __acpp_##type x) {           \
+    switch (op) {                                                                \
+    case __acpp_sscp_algorithm_op::plus:                                         \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x);     \
+    case __acpp_sscp_algorithm_op::multiply:                                     \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \
+    case __acpp_sscp_algorithm_op::min:                                          \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x);      \
+    case __acpp_sscp_algorithm_op::max:                                          \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x);      \
+    default:                                                                     \
+      return __acpp_##type{};                                                    \
+    }                                                                            \
+  }
+
+ACPP_SUBGROUP_FLOAT_REDUCTION(f16)
+ACPP_SUBGROUP_FLOAT_REDUCTION(f32)
+ACPP_SUBGROUP_FLOAT_REDUCTION(f64)
+
+#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type)                                  \
+  HIPSYCL_SSCP_CONVERGENT_BUILTIN                                                     \
+  __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \
+                                                         __acpp_##type x) {           \
+    switch (op) {                                                                     \
+    case __acpp_sscp_algorithm_op::plus:                                              \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x);        \
+    case __acpp_sscp_algorithm_op::multiply:                                          \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x);    \
+    case __acpp_sscp_algorithm_op::min:                                               \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x);         \
+    case __acpp_sscp_algorithm_op::max:                                               \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x);         \
+    case __acpp_sscp_algorithm_op::bit_and:                                           \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x);     \
+    case __acpp_sscp_algorithm_op::bit_or:                                            \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x);      \
+    case __acpp_sscp_algorithm_op::bit_xor:                                           \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x);     \
+    case __acpp_sscp_algorithm_op::logical_and:                                       \
+      return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \
+    case __acpp_sscp_algorithm_op::logical_or:                                        \
+      return
hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, shrd_mem); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) +ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, shrd_mem); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) +ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) +ACPP_WORKGROUP_INT_REDUCTION(u16, 
uint16)
+ACPP_WORKGROUP_INT_REDUCTION(u32, uint32)
+ACPP_WORKGROUP_INT_REDUCTION(u64, uint64)
diff --git a/src/libkernel/sscp/host/relational.cpp b/src/libkernel/sscp/host/relational.cpp
index 4dd52b310..1b4cc3908 100644
--- a/src/libkernel/sscp/host/relational.cpp
+++ b/src/libkernel/sscp/host/relational.cpp
@@ -10,7 +10,7 @@
 // SPDX-License-Identifier: BSD-2-Clause
 #include "hipSYCL/sycl/libkernel/sscp/builtins/relational.hpp"
-#include <math.h>
+#include <cmath>
 
 #define HIPSYCL_SSCP_MAP_HOST_REL_BUILTIN(name)                                \
   HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_##name##_f32(float x) {        \
@@ -29,8 +29,8 @@ HIPSYCL_SSCP_MAP_HOST_REL_BUILTIN(isfinite)
 HIPSYCL_SSCP_MAP_HOST_REL_BUILTIN(isnormal)
 
 HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_signbit_f32(float x) {
-  return signbit(x);
+  return std::signbit(x);
 }
 HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_signbit_f64(double x) {
-  return signbit(x);
+  return std::signbit(x);
 }
diff --git a/src/libkernel/sscp/host/scan_exclusive.cpp b/src/libkernel/sscp/host/scan_exclusive.cpp
new file mode 100644
index 000000000..0087cad16
--- /dev/null
+++ b/src/libkernel/sscp/host/scan_exclusive.cpp
@@ -0,0 +1,165 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+#define ACPP_SSCP_OMP_LIBKERNEL
+
+#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp"
+
+HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory();
+
+#define ACPP_SUBGROUP_FLOAT_SCAN(type)                                                    \
+  HIPSYCL_SSCP_CONVERGENT_BUILTIN                                                         \
+  __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op,  \
+                                                            __acpp_##type x, __acpp_##type init) { \
+    switch (op) {                                                                         \
+    case __acpp_sscp_algorithm_op::plus:                                                  \
+      return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{},     \
+                                                         init);                           \
+    case __acpp_sscp_algorithm_op::multiply:                                              \
+      return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \
+                                                         init);                           \
+    case __acpp_sscp_algorithm_op::min:                                                   \
+      return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{},      \
+                                                         init);                           \
+    case __acpp_sscp_algorithm_op::max:                                                   \
+      return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{},      \
+                                                         init);                           \
+    default:                                                                              \
+      return __acpp_##type{};                                                             \
+    }                                                                                     \
+  }
+
+ACPP_SUBGROUP_FLOAT_SCAN(f16)
+ACPP_SUBGROUP_FLOAT_SCAN(f32)
+ACPP_SUBGROUP_FLOAT_SCAN(f64)
+
+#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type)                                           \
+  HIPSYCL_SSCP_CONVERGENT_BUILTIN                                                         \
+  __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##fn_suffix(                         \
+      __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) {                 \
+    switch (op) {                                                                         \
+    case __acpp_sscp_algorithm_op::plus:                                                  \
+      return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{},     \
+                                                         init);                           \
+    case __acpp_sscp_algorithm_op::multiply:                                              \
+      return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \
+                                                         init);                           \
+    case __acpp_sscp_algorithm_op::min:                                                   \
+      return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{},      \
+                                                         init);                           \
+    case __acpp_sscp_algorithm_op::max:                                                   \
+      return
hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + shrd_mem, 
init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + shrd_mem, init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, shrd_mem, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, shrd_mem, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/host/scan_inclusive.cpp b/src/libkernel/sscp/host/scan_inclusive.cpp new file mode 100644 index 000000000..ef7445df1 --- /dev/null +++ b/src/libkernel/sscp/host/scan_inclusive.cpp @@ -0,0 +1,154 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause +#define ACPP_SSCP_OMP_LIBKERNEL + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_host.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +HIPSYCL_SSCP_BUILTIN void *__acpp_sscp_host_get_internal_local_memory(); + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ 
+ case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + __acpp_##type *shrd_mem = \ + static_cast<__acpp_##type *>(__acpp_sscp_host_get_internal_local_memory()); \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::plus{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::min{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::max{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_host_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_host_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, shrd_mem); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return 
hipsycl::libkernel::sscp::wg_host_scan(                                        \
+          x, hipsycl::libkernel::sscp::logical_or{}, shrd_mem);               \
+    default:                                                                  \
+      return __acpp_##type{};                                                 \
+    }                                                                         \
+  }
+
+ACPP_WORKGROUP_INT_SCAN(i8, int8)
+ACPP_WORKGROUP_INT_SCAN(i16, int16)
+ACPP_WORKGROUP_INT_SCAN(i32, int32)
+ACPP_WORKGROUP_INT_SCAN(i64, int64)
+ACPP_WORKGROUP_INT_SCAN(u8, uint8)
+ACPP_WORKGROUP_INT_SCAN(u16, uint16)
+ACPP_WORKGROUP_INT_SCAN(u32, uint32)
+ACPP_WORKGROUP_INT_SCAN(u64, uint64)
diff --git a/src/libkernel/sscp/host/shuffle.cpp b/src/libkernel/sscp/host/shuffle.cpp
new file mode 100644
index 000000000..606d86255
--- /dev/null
+++ b/src/libkernel/sscp/host/shuffle.cpp
@@ -0,0 +1,53 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp"
+
+// On the host backend the sub-group size is one, so a shuffle can only ever
+// read from the calling lane itself.
+#define SUBGROUP_SIZE_ONE_SHUFFLE(int_size, direction)                                             \
+  HIPSYCL_SSCP_CONVERGENT_BUILTIN                                                                  \
+  __acpp_int##int_size __acpp_sscp_sub_group_##direction##_i##int_size(__acpp_int##int_size value, \
+                                                                       __acpp_uint32 delta) {      \
+    return delta == 0 ? value : 0;                                                                 \
+  }
+
+SUBGROUP_SIZE_ONE_SHUFFLE(8, shl)
+SUBGROUP_SIZE_ONE_SHUFFLE(16, shl)
+SUBGROUP_SIZE_ONE_SHUFFLE(32, shl)
+SUBGROUP_SIZE_ONE_SHUFFLE(64, shl)
+SUBGROUP_SIZE_ONE_SHUFFLE(8, shr)
+SUBGROUP_SIZE_ONE_SHUFFLE(16, shr)
+SUBGROUP_SIZE_ONE_SHUFFLE(32, shr)
+SUBGROUP_SIZE_ONE_SHUFFLE(64, shr)
+
+#define SUBGROUP_SIZE_ONE_PERMUTE(int_size)                                                        \
+  HIPSYCL_SSCP_CONVERGENT_BUILTIN                                                                  \
+  __acpp_int##int_size __acpp_sscp_sub_group_permute_i##int_size(__acpp_int##int_size value,       \
+                                                                 __acpp_int32 mask) {              \
+    return mask == 0 ? value : 0;                                                                  \
+  }
+
+SUBGROUP_SIZE_ONE_PERMUTE(8)
+SUBGROUP_SIZE_ONE_PERMUTE(16)
+SUBGROUP_SIZE_ONE_PERMUTE(32)
+SUBGROUP_SIZE_ONE_PERMUTE(64)
+
+#define SUBGROUP_SIZE_ONE_SELECT(int_size)                                                         \
+  HIPSYCL_SSCP_CONVERGENT_BUILTIN                                                                  \
+  __acpp_int##int_size __acpp_sscp_sub_group_select_i##int_size(__acpp_int##int_size value,        \
+                                                                __acpp_int32 mask) {               \
+    return mask == 0 ?
value : 0;                                                                                         \
+  }
+
+SUBGROUP_SIZE_ONE_SELECT(8)
+SUBGROUP_SIZE_ONE_SELECT(16)
+SUBGROUP_SIZE_ONE_SELECT(32)
+SUBGROUP_SIZE_ONE_SELECT(64)
diff --git a/src/libkernel/sscp/ptx/CMakeLists.txt b/src/libkernel/sscp/ptx/CMakeLists.txt
index 063c0a507..a5c4367de 100644
--- a/src/libkernel/sscp/ptx/CMakeLists.txt
+++ b/src/libkernel/sscp/ptx/CMakeLists.txt
@@ -3,6 +3,23 @@ if(WITH_LLVM_TO_PTX)
   libkernel_generate_bitcode_target(
     TARGETNAME ptx
     TRIPLE nvptx64-nvidia-cuda
-    SOURCES atomic.cpp barrier.cpp core.cpp half.cpp integer.cpp print.cpp relational.cpp math.cpp native.cpp localmem.cpp subgroup.cpp
+    SOURCES
+      atomic.cpp
+      barrier.cpp
+      core.cpp
+      half.cpp
+      integer.cpp
+      print.cpp
+      relational.cpp
+      math.cpp
+      native.cpp
+      localmem.cpp
+      subgroup.cpp
+      shuffle.cpp
+      reduction.cpp
+      broadcast.cpp
+      scan_inclusive.cpp
+      scan_exclusive.cpp
+      collpredicate.cpp
     ADDITIONAL_ARGS -Xclang -target-feature -Xclang +sm_60)
 endif()
diff --git a/src/libkernel/sscp/ptx/atomic.cpp b/src/libkernel/sscp/ptx/atomic.cpp
index 306c26bb6..50282cd96 100644
--- a/src/libkernel/sscp/ptx/atomic.cpp
+++ b/src/libkernel/sscp/ptx/atomic.cpp
@@ -13,6 +13,8 @@
 #include "hipSYCL/sycl/libkernel/sscp/builtins/ptx/libdevice.hpp"
 
+extern "C" int __acpp_sscp_jit_reflect_target_arch();
+
 // Atomic definitions adapted from __clang_cuda_device_functions.h
 
 double __dAtomicAdd(double *__p, double __v) {
@@ -419,8 +421,6 @@ unsigned long long __ullAtomicXor_system(unsigned long long *__p,
-
-
 // ********************** atomic store ***************************
 
 // Unlike the CUDA compilation flow, the __atomic_store and __atomic_load builtin
@@ -438,32 +438,63 @@ void mem_fence(__acpp_sscp_memory_scope fence_scope) {
   }
 }
 
+template <class T>
+T memfenced_load(T* ptr, __acpp_sscp_memory_scope scope) {
+  mem_fence(scope);
+  T x = *ptr;
+  mem_fence(scope);
+  return x;
+}
+
+template <class T>
+void memfenced_store(T* ptr, T x, __acpp_sscp_memory_scope scope) {
+  mem_fence(scope);
+  *ptr = x;
+  mem_fence(scope);
+}
+
 HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i8(
     __acpp_sscp_address_space as, __acpp_sscp_memory_order order,
     __acpp_sscp_memory_scope scope, __acpp_int8 *ptr, __acpp_int8 x) {
-  *ptr = x;
-  mem_fence(scope);
+  memfenced_store(ptr, x, scope);
 }
 
 HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i16(
     __acpp_sscp_address_space as, __acpp_sscp_memory_order order,
     __acpp_sscp_memory_scope scope, __acpp_int16 *ptr, __acpp_int16 x) {
-  *ptr = x;
-  mem_fence(scope);
+  memfenced_store(ptr, x, scope);
 }
 
 HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i32(
     __acpp_sscp_address_space as, __acpp_sscp_memory_order order,
     __acpp_sscp_memory_scope scope, __acpp_int32 *ptr, __acpp_int32 x) {
-  *ptr = x;
-  mem_fence(scope);
+  if(__acpp_sscp_jit_reflect_target_arch() >= 70) {
+    if(scope == __acpp_sscp_memory_scope::system) {
+      if(order == __acpp_sscp_memory_order::release) {
+        asm volatile("st.release.sys.s32 [%0], %1;"
+                     :
+                     :"l"(ptr), "r"(x)
+                     : "memory");
+        return;
+      }
+    } else if(scope == __acpp_sscp_memory_scope::device) {
+      if(order == __acpp_sscp_memory_order::release) {
+        asm volatile("st.release.gpu.s32 [%0], %1;"
+                     :
+                     :"l"(ptr), "r"(x)
+                     : "memory");
+        return;
+      }
+    }
+  }
+  memfenced_store(ptr, x, scope);
 }
 
 HIPSYCL_SSCP_BUILTIN void __acpp_sscp_atomic_store_i64(
    __acpp_sscp_address_space as, __acpp_sscp_memory_order order,
    __acpp_sscp_memory_scope scope, __acpp_int64 *ptr, __acpp_int64 x) {
-  *ptr = x;
-  mem_fence(scope);
+  memfenced_store(ptr, x, scope);
 }
 
@@ -472,25 +503,47 @@
HIPSYCL_SSCP_BUILTIN __acpp_int8 __acpp_sscp_atomic_load_i8( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int8 *ptr) { - return *ptr; + return memfenced_load(ptr, scope); } HIPSYCL_SSCP_BUILTIN __acpp_int16 __acpp_sscp_atomic_load_i16( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int16 *ptr) { - return *ptr; + return memfenced_load(ptr, scope); } HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_load_i32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int32 *ptr) { - return *ptr; + + if(__acpp_sscp_jit_reflect_target_arch() >= 70) { + if(scope == __acpp_sscp_memory_scope::system) { + if(order == __acpp_sscp_memory_order::acquire) { + __acpp_int32 result; + asm volatile("ld.acquire.sys.u32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; + } + } else if(scope == __acpp_sscp_memory_scope::device) { + if(order == __acpp_sscp_memory_order::acquire) { + __acpp_int32 result; + asm volatile("ld.acquire.gpu.u32 %0,[%1];" + : "=r"(result) + : "l"(ptr) + : "memory"); + return result; + } + } + } + return memfenced_load(ptr, scope); } HIPSYCL_SSCP_BUILTIN __acpp_int64 __acpp_sscp_atomic_load_i64( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int64 *ptr) { - return *ptr; + return memfenced_load(ptr, scope); } // for internal use only, not part of the public API @@ -622,7 +675,21 @@ HIPSYCL_SSCP_BUILTIN bool __acpp_sscp_cmp_exch_strong_i32( __acpp_int32 old = *expected; if (scope == __acpp_sscp_memory_scope::system) { - *expected = __iAtomicCAS_system(ptr, *expected, desired); + if (success == __acpp_sscp_memory_order::acquire && + failure == __acpp_sscp_memory_order::acquire) { + __acpp_int32 compare = *expected; + __acpp_int32 result; + // Documentation says u32/s32 types should be allowed, + // but driver currently does not accept this. So use b32 + // instead. 
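+      // This branch is only reached for acquire/acquire success/failure
+      // orderings at system scope: a single acquire-scope CAS instruction
+      // encodes the required ordering directly, instead of going through
+      // the relaxed __iAtomicCAS_system fallback below.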
+ asm volatile("atom.acquire.sys.cas.b32 %0,[%1],%2,%3;" + : "=r"(result) + : "l"(ptr), "r"(compare), "r"(desired) + : "memory"); + *expected = result; + } else { + *expected = __iAtomicCAS_system(ptr, *expected, desired); + } } else if (scope == __acpp_sscp_memory_scope::device) { *expected = __iAtomicCAS(ptr, *expected, desired); } else /* work group, sub group or work item */ { @@ -822,8 +889,18 @@ HIPSYCL_SSCP_BUILTIN __acpp_int16 __acpp_sscp_atomic_fetch_add_i16( HIPSYCL_SSCP_BUILTIN __acpp_int32 __acpp_sscp_atomic_fetch_add_i32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_int32 *ptr, __acpp_int32 x) { + if (scope == __acpp_sscp_memory_scope::system) { - return __iAtomicAdd_system(ptr, x); + if(order == __acpp_sscp_memory_order::acq_rel) { + __acpp_int32 result; + asm volatile("atom.add.acq_rel.sys.s32 %0,[%1],%2;" + : "=r"(result) + : "l"(ptr), "r"(x) + : "memory"); + return result; + } + else + return __iAtomicAdd_system(ptr, x); } else if (scope == __acpp_sscp_memory_scope::device) { return __iAtomicAdd(ptr, x); } else /* work group, sub group or work item */ { @@ -863,7 +940,16 @@ HIPSYCL_SSCP_BUILTIN __acpp_uint32 __acpp_sscp_atomic_fetch_add_u32( __acpp_sscp_address_space as, __acpp_sscp_memory_order order, __acpp_sscp_memory_scope scope, __acpp_uint32 *ptr, __acpp_uint32 x) { if (scope == __acpp_sscp_memory_scope::system) { - return __uAtomicAdd_system(ptr, x); + if(order == __acpp_sscp_memory_order::acq_rel) { + __acpp_uint32 result; + asm volatile("atom.add.acq_rel.sys.u32 %0,[%1],%2;" + : "=r"(result) + : "l"(ptr), "r"(x) + : "memory"); + return result; + } + else + return __uAtomicAdd_system(ptr, x); } else if (scope == __acpp_sscp_memory_scope::device) { return __uAtomicAdd(ptr, x); } else /* work group, sub group or work item */ { diff --git a/src/libkernel/sscp/ptx/broadcast.cpp b/src/libkernel/sscp/ptx/broadcast.cpp new file mode 100644 index 000000000..85a687283 --- /dev/null +++ b/src/libkernel/sscp/ptx/broadcast.cpp @@ -0,0 +1,38 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +#define ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + ACPP_SHMEM_ATTRIBUTE __acpp_##input_type shrd_x[1]; \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, &shrd_x[0]); \ + } + +ACPP_WORKGROUP_BCAST(i8,int8) +ACPP_WORKGROUP_BCAST(i16,int16) +ACPP_WORKGROUP_BCAST(i32,int32) +ACPP_WORKGROUP_BCAST(i64,int64) + +ACPP_SUBGROUP_BCAST(i8,int8) +ACPP_SUBGROUP_BCAST(i16,int16) +ACPP_SUBGROUP_BCAST(i32,int32) +ACPP_SUBGROUP_BCAST(i64,int64) diff --git a/src/libkernel/sscp/ptx/collpredicate.cpp b/src/libkernel/sscp/ptx/collpredicate.cpp new file mode 100644 index 000000000..b243639fb --- /dev/null +++ b/src/libkernel/sscp/ptx/collpredicate.cpp @@ -0,0 +1,47 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + + #include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + #include "hipSYCL/sycl/libkernel/sscp/builtins/amdgpu/ockl.hpp" + + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_any(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_all(bool pred){ + return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_work_group_none(bool pred){ + bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_all(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_any(bool pred){ + return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +bool __acpp_sscp_sub_group_none(bool pred){ + bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred); + return !result_or; +} diff --git a/src/libkernel/sscp/ptx/reduction.cpp b/src/libkernel/sscp/ptx/reduction.cpp new file mode 100644 index 000000000..cab784123 --- /dev/null +++ b/src/libkernel/sscp/ptx/reduction.cpp @@ -0,0 +1,152 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
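+ *
+ * Sub-group reductions forward to sg_reduce, templated on the algorithm op;
+ * work-group reductions combine per-sub-group partial results through a
+ * 32-slot shared-memory array via wg_reduce. On PTX, an unknown op traps
+ * instead of silently returning a default value.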
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + +#define ACPP_SUBGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_REDUCTION(f16) +ACPP_SUBGROUP_FLOAT_REDUCTION(f32) +ACPP_SUBGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + 
return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) +ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) +ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) +ACPP_WORKGROUP_INT_REDUCTION(u16, uint16) +ACPP_WORKGROUP_INT_REDUCTION(u32, uint32) +ACPP_WORKGROUP_INT_REDUCTION(u64, uint64) diff --git a/src/libkernel/sscp/ptx/scan_exclusive.cpp b/src/libkernel/sscp/ptx/scan_exclusive.cpp new file mode 100644 index 000000000..0950e8095 --- /dev/null +++ b/src/libkernel/sscp/ptx/scan_exclusive.cpp @@ -0,0 +1,166 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; 
\ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0], init); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/ptx/scan_inclusive.cpp b/src/libkernel/sscp/ptx/scan_inclusive.cpp new file mode 100644 index 000000000..70f0e1248 --- /dev/null +++ b/src/libkernel/sscp/ptx/scan_inclusive.cpp @@ -0,0 +1,155 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. 
+ * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_hiplike.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, 
hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_hiplike_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + __asm__ __volatile__("trap;"); \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/ptx/shuffle.cpp b/src/libkernel/sscp/ptx/shuffle.cpp new file mode 100644 index 000000000..cf8ce45c6 --- /dev/null +++ b/src/libkernel/sscp/ptx/shuffle.cpp @@ -0,0 +1,121 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause +#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/subgroup.hpp" + +constexpr unsigned int FULL_MASK = 0xffffffff; + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shl_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shl_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shl_i32(__acpp_int32 value, __acpp_uint32 delta) { + // __acpp_uint32 mask = get_active_mask(); + return __nvvm_shfl_sync_down_i32(FULL_MASK, value, delta, 0x1f); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shl_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shl_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shl_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_shr_i8(__acpp_int8 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_shr_i16(__acpp_int16 value, __acpp_uint32 delta) { + return __acpp_sscp_sub_group_shr_i32(value, delta); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_shr_i32(__acpp_int32 value, __acpp_uint32 delta) { + // __acpp_uint32 mask = get_active_mask(); + return __nvvm_shfl_sync_up_i32(FULL_MASK, value, delta, 0); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_shr_i64(__acpp_int64 value, __acpp_uint32 delta) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_shr_i32(tmp[0], delta); + tmp[1] = __acpp_sscp_sub_group_shr_i32(tmp[1], delta); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_permute_i8(__acpp_int8 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_permute_i16(__acpp_int16 value, __acpp_int32 mask) { + return __acpp_sscp_sub_group_permute_i32(value, mask); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_permute_i32(__acpp_int32 value, __acpp_int32 mask) { + // __acpp_uint32 active_thread_mask = get_active_mask(); + return __nvvm_shfl_sync_bfly_i32(FULL_MASK, value, mask, 0x1f); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_permute_i64(__acpp_int64 value, __acpp_int32 mask) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_permute_i32(tmp[0], mask); + tmp[1] = __acpp_sscp_sub_group_permute_i32(tmp[1], mask); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int8 __acpp_sscp_sub_group_select_i8(__acpp_int8 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + 
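+// Note: the 64-bit shuffle variants in this file are emulated by memcpy'ing
+// the value into two 32-bit halves, shuffling each half independently, and
+// recombining the halves. This is safe because the source lane depends only
+// on the delta/mask/id parameter, never on the shuffled payload, so both
+// halves are read from the same lane.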
+HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int16 __acpp_sscp_sub_group_select_i16(__acpp_int16 value, __acpp_int32 id) { + return __acpp_sscp_sub_group_select_i32(value, id); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int32 __acpp_sscp_sub_group_select_i32(__acpp_int32 value, __acpp_int32 id) { + return __nvvm_shfl_sync_idx_i32(FULL_MASK, value, id, 31); +} + +HIPSYCL_SSCP_CONVERGENT_BUILTIN +__acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id) { + int tmp[2]; + __builtin_memcpy(tmp, &value, sizeof(tmp)); + tmp[0] = __acpp_sscp_sub_group_select_i32(tmp[0], id); + tmp[1] = __acpp_sscp_sub_group_select_i32(tmp[1], id); + __acpp_int64 result = + (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0])); + return result; +} diff --git a/src/libkernel/sscp/spirv/CMakeLists.txt b/src/libkernel/sscp/spirv/CMakeLists.txt index 57a189762..da863b6a1 100644 --- a/src/libkernel/sscp/spirv/CMakeLists.txt +++ b/src/libkernel/sscp/spirv/CMakeLists.txt @@ -3,5 +3,22 @@ if(WITH_LLVM_TO_SPIRV) libkernel_generate_bitcode_target( TARGETNAME spirv TRIPLE spir64-unknown-unknown - SOURCES atomic.cpp barrier.cpp core.cpp half.cpp math.cpp native.cpp integer.cpp print.cpp relational.cpp localmem.cpp subgroup.cpp) + SOURCES + atomic.cpp + barrier.cpp + core.cpp + half.cpp + math.cpp + native.cpp + integer.cpp + print.cpp + relational.cpp + localmem.cpp + subgroup.cpp + shuffle.cpp + reduction.cpp + broadcast.cpp + scan_inclusive.cpp + scan_exclusive.cpp + collpredicate.cpp) endif() diff --git a/src/libkernel/sscp/spirv/broadcast.cpp b/src/libkernel/sscp/spirv/broadcast.cpp new file mode 100644 index 000000000..d9c3380d1 --- /dev/null +++ b/src/libkernel/sscp/spirv/broadcast.cpp @@ -0,0 +1,38 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/broadcast.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/broadcast.hpp" + +#define ACPP_SUBGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_sub_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + return __acpp_sscp_sub_group_select_##fn_suffix(x, sender); \ + } + +#define ACPP_WORKGROUP_BCAST(fn_suffix, input_type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##input_type __acpp_sscp_work_group_broadcast_##fn_suffix(__acpp_int32 sender, \ + __acpp_##input_type x) { \ + ACPP_SHMEM_ATTRIBUTE __acpp_##input_type shrd_x[1]; \ + return hipsycl::libkernel::sscp::wg_broadcast(sender, x, &shrd_x[0]); \ + } + +ACPP_WORKGROUP_BCAST(i8,int8) +ACPP_WORKGROUP_BCAST(i16,int16) +ACPP_WORKGROUP_BCAST(i32,int32) +ACPP_WORKGROUP_BCAST(i64,int64) + +ACPP_SUBGROUP_BCAST(i8,int8) +ACPP_SUBGROUP_BCAST(i16,int16) +ACPP_SUBGROUP_BCAST(i32,int32) +ACPP_SUBGROUP_BCAST(i64,int64) \ No newline at end of file diff --git a/src/libkernel/sscp/spirv/collpredicate.cpp b/src/libkernel/sscp/spirv/collpredicate.cpp new file mode 100644 index 000000000..b243639fb --- /dev/null +++ b/src/libkernel/sscp/spirv/collpredicate.cpp @@ -0,0 +1,47 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/sycl/libkernel/sscp/builtins/collpredicate.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp"
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+bool __acpp_sscp_work_group_any(bool pred) {
+  return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+bool __acpp_sscp_work_group_all(bool pred) {
+  return __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+bool __acpp_sscp_work_group_none(bool pred) {
+  bool result_or = __acpp_sscp_work_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred);
+  return !result_or;
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+bool __acpp_sscp_sub_group_all(bool pred) {
+  return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_and, pred);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+bool __acpp_sscp_sub_group_any(bool pred) {
+  return __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+bool __acpp_sscp_sub_group_none(bool pred) {
+  bool result_or = __acpp_sscp_sub_group_reduce_i8(__acpp_sscp_algorithm_op::logical_or, pred);
+  return !result_or;
+}
diff --git a/src/libkernel/sscp/spirv/reduction.cpp b/src/libkernel/sscp/spirv/reduction.cpp
new file mode 100644
index 000000000..6ba3e85f9
--- /dev/null
+++ b/src/libkernel/sscp/spirv/reduction.cpp
@@ -0,0 +1,148 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/reduction.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/reduction.hpp" + +#define ACPP_SUBGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_REDUCTION(f16) +ACPP_SUBGROUP_FLOAT_REDUCTION(f32) +ACPP_SUBGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_SUBGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::plus>(x); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::multiply>(x); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::min>(x); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::max>(x); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_and>(x); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_or>(x); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::bit_xor>(x); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_and>(x); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_reduce<__acpp_sscp_algorithm_op::logical_or>(x); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_REDUCTION(i8, int8) +ACPP_SUBGROUP_INT_REDUCTION(i16, int16) +ACPP_SUBGROUP_INT_REDUCTION(i32, int32) +ACPP_SUBGROUP_INT_REDUCTION(i64, int64) +ACPP_SUBGROUP_INT_REDUCTION(u8, uint8) +ACPP_SUBGROUP_INT_REDUCTION(u16, uint16) +ACPP_SUBGROUP_INT_REDUCTION(u32, uint32) +ACPP_SUBGROUP_INT_REDUCTION(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_REDUCTION(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, 
hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_REDUCTION(f16) +ACPP_WORKGROUP_FLOAT_REDUCTION(f32) +ACPP_WORKGROUP_FLOAT_REDUCTION(f64) + +#define ACPP_WORKGROUP_INT_REDUCTION(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_reduce_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 32; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_reduce( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_REDUCTION(i8, int8) +ACPP_WORKGROUP_INT_REDUCTION(i16, int16) +ACPP_WORKGROUP_INT_REDUCTION(i32, int32) +ACPP_WORKGROUP_INT_REDUCTION(i64, int64) +ACPP_WORKGROUP_INT_REDUCTION(u8, uint8) +ACPP_WORKGROUP_INT_REDUCTION(u16, uint16) +ACPP_WORKGROUP_INT_REDUCTION(u32, uint32) +ACPP_WORKGROUP_INT_REDUCTION(u64, uint64) diff --git a/src/libkernel/sscp/spirv/scan_exclusive.cpp b/src/libkernel/sscp/spirv/scan_exclusive.cpp new file mode 100644 index 000000000..9d92ec25d --- /dev/null +++ b/src/libkernel/sscp/spirv/scan_exclusive.cpp @@ -0,0 +1,162 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_exclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::plus{}, \ + init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::multiply{}, \ + init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::min{}, \ + init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::max{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}, \ + init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}, \ + init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_exclusive_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##type( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return 
hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_exclusive_scan_##fn_suffix( \ + __acpp_sscp_algorithm_op op, __acpp_##type x, __acpp_##type init) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0], init); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0], init); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/spirv/scan_inclusive.cpp b/src/libkernel/sscp/spirv/scan_inclusive.cpp new file mode 100644 index 000000000..c1ce224af --- /dev/null +++ b/src/libkernel/sscp/spirv/scan_inclusive.cpp @@ -0,0 +1,151 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include "hipSYCL/sycl/libkernel/sscp/builtins/scan_inclusive.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_generic.hpp" +#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/scan_subgroup.hpp" + +#define ACPP_SUBGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_FLOAT_SCAN(f16) +ACPP_SUBGROUP_FLOAT_SCAN(f32) +ACPP_SUBGROUP_FLOAT_SCAN(f64) + +#define ACPP_SUBGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_sub_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::plus{}); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::multiply{}); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::min{}); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::max{}); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_and{}); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_or{}); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, hipsycl::libkernel::sscp::bit_xor{}); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_and{}); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::sg_inclusive_scan(x, \ + hipsycl::libkernel::sscp::logical_or{}); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_SUBGROUP_INT_SCAN(i8, int8) +ACPP_SUBGROUP_INT_SCAN(i16, int16) +ACPP_SUBGROUP_INT_SCAN(i32, int32) +ACPP_SUBGROUP_INT_SCAN(i64, int64) +ACPP_SUBGROUP_INT_SCAN(u8, uint8) +ACPP_SUBGROUP_INT_SCAN(u16, uint16) +ACPP_SUBGROUP_INT_SCAN(u32, uint32) +ACPP_SUBGROUP_INT_SCAN(u64, uint64) + +#define ACPP_WORKGROUP_FLOAT_SCAN(type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##type(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return 
hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_FLOAT_SCAN(f16) +ACPP_WORKGROUP_FLOAT_SCAN(f32) +ACPP_WORKGROUP_FLOAT_SCAN(f64) + +#define ACPP_WORKGROUP_INT_SCAN(fn_suffix, type) \ + HIPSYCL_SSCP_CONVERGENT_BUILTIN \ + __acpp_##type __acpp_sscp_work_group_inclusive_scan_##fn_suffix(__acpp_sscp_algorithm_op op, \ + __acpp_##type x) { \ + constexpr size_t shmem_array_length = 33; \ + ACPP_SHMEM_ATTRIBUTE __acpp_##type shrd_mem[shmem_array_length]; \ + switch (op) { \ + case __acpp_sscp_algorithm_op::plus: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::plus{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::multiply: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::multiply{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::min: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::min{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::max: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::max{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_and: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_or{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::bit_xor: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::bit_xor{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_and: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_and{}, &shrd_mem[0]); \ + case __acpp_sscp_algorithm_op::logical_or: \ + return hipsycl::libkernel::sscp::wg_generic_scan( \ + x, hipsycl::libkernel::sscp::logical_or{}, &shrd_mem[0]); \ + default: \ + return __acpp_##type{}; \ + } \ + } + +ACPP_WORKGROUP_INT_SCAN(i8, int8) +ACPP_WORKGROUP_INT_SCAN(i16, int16) +ACPP_WORKGROUP_INT_SCAN(i32, int32) +ACPP_WORKGROUP_INT_SCAN(i64, int64) +ACPP_WORKGROUP_INT_SCAN(u8, uint8) +ACPP_WORKGROUP_INT_SCAN(u16, uint16) +ACPP_WORKGROUP_INT_SCAN(u32, uint32) +ACPP_WORKGROUP_INT_SCAN(u64, uint64) diff --git a/src/libkernel/sscp/spirv/shuffle.cpp b/src/libkernel/sscp/spirv/shuffle.cpp new file mode 100644 index 000000000..82a128595 --- /dev/null +++ b/src/libkernel/sscp/spirv/shuffle.cpp @@ -0,0 +1,134 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. 
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/sycl/libkernel/sscp/builtins/shuffle.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/detail/shuffle.hpp"
+#include "hipSYCL/sycl/libkernel/sscp/builtins/subgroup.hpp"
+
+template <class dataT>
+dataT __spirv_SubgroupShuffleINTEL(dataT Data, __acpp_uint32 InvocationId) noexcept;
+template <class dataT>
+dataT __spirv_SubgroupShuffleDownINTEL(dataT Current, dataT Next, __acpp_uint32 Delta) noexcept;
+template <class dataT>
+dataT __spirv_SubgroupShuffleUpINTEL(dataT Previous, dataT Current, __acpp_uint32 Delta) noexcept;
+template <class dataT>
+dataT __spirv_SubgroupShuffleXorINTEL(dataT Data, __acpp_uint32 Value) noexcept;
+
+template <class ValueT, class IdT>
+ValueT __spirv_GroupNonUniformShuffle(__acpp_uint32, ValueT, IdT) noexcept;
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int8 __acpp_sscp_sub_group_shl_i8(__acpp_int8 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shl_i32(value, delta);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int16 __acpp_sscp_sub_group_shl_i16(__acpp_int16 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shl_i32(value, delta);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int32 __acpp_sscp_sub_group_shl_i32(__acpp_int32 value, __acpp_uint32 delta) {
+  __acpp_int32 local_id = __acpp_sscp_get_subgroup_local_id();
+  __acpp_int32 target_id = local_id + delta;
+  if (target_id >= __acpp_sscp_get_subgroup_size())
+    target_id = local_id;
+  return __spirv_GroupNonUniformShuffle(3, value, target_id);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int64 __acpp_sscp_sub_group_shl_i64(__acpp_int64 value, __acpp_uint32 delta) {
+  int tmp[2];
+  __builtin_memcpy(tmp, &value, sizeof(tmp));
+  tmp[0] = __acpp_sscp_sub_group_shl_i32(tmp[0], delta);
+  tmp[1] = __acpp_sscp_sub_group_shl_i32(tmp[1], delta);
+  __acpp_int64 result =
+      (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0]));
+  return result;
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int8 __acpp_sscp_sub_group_shr_i8(__acpp_int8 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shr_i32(value, delta);
+}
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int16 __acpp_sscp_sub_group_shr_i16(__acpp_int16 value, __acpp_uint32 delta) {
+  return __acpp_sscp_sub_group_shr_i32(value, delta);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int32 __acpp_sscp_sub_group_shr_i32(__acpp_int32 value, __acpp_uint32 delta) {
+  __acpp_int32 local_id = __acpp_sscp_get_subgroup_local_id();
+  __acpp_int32 target_id = local_id;
+  if (local_id >= delta)
+    target_id -= delta;
+  return __spirv_GroupNonUniformShuffle(3, value, target_id);
+  // return __spirv_SubgroupShuffleDownINTEL(value, value, delta);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int64 __acpp_sscp_sub_group_shr_i64(__acpp_int64 value, __acpp_uint32 delta) {
+  int tmp[2];
+  __builtin_memcpy(tmp, &value, sizeof(tmp));
+  tmp[0] = __acpp_sscp_sub_group_shr_i32(tmp[0], delta);
+  tmp[1] = __acpp_sscp_sub_group_shr_i32(tmp[1], delta);
+  __acpp_int64 result =
+      (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0]));
+  return result;
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int8 __acpp_sscp_sub_group_permute_i8(__acpp_int8 value, __acpp_int32 mask) {
+  return __acpp_sscp_sub_group_permute_i32(value, mask);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int16 __acpp_sscp_sub_group_permute_i16(__acpp_int16 value, __acpp_int32 mask) {
+  return __acpp_sscp_sub_group_permute_i32(value, mask);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int32 __acpp_sscp_sub_group_permute_i32(__acpp_int32 value, __acpp_int32
mask) {
+  return __spirv_SubgroupShuffleXorINTEL(value, mask);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int64 __acpp_sscp_sub_group_permute_i64(__acpp_int64 value, __acpp_int32 mask) {
+  __acpp_int32 local_id = __acpp_sscp_get_subgroup_local_id();
+  __acpp_int32 target_id = mask ^ local_id;
+  return __spirv_GroupNonUniformShuffle(3, value, target_id);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int8 __acpp_sscp_sub_group_select_i8(__acpp_int8 value, __acpp_int32 id) {
+  return __acpp_sscp_sub_group_select_i32(value, id);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int16 __acpp_sscp_sub_group_select_i16(__acpp_int16 value, __acpp_int32 id) {
+  return __acpp_sscp_sub_group_select_i32(value, id);
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int32 __acpp_sscp_sub_group_select_i32(__acpp_int32 value, __acpp_int32 id) {
+  return __builtin_bit_cast(__acpp_int32, __spirv_GroupNonUniformShuffle(3u, value, id));
+}
+
+HIPSYCL_SSCP_CONVERGENT_BUILTIN
+__acpp_int64 __acpp_sscp_sub_group_select_i64(__acpp_int64 value, __acpp_int32 id) {
+  int tmp[2];
+  __builtin_memcpy(tmp, &value, sizeof(tmp));
+  tmp[0] = __acpp_sscp_sub_group_select_i32(tmp[0], id);
+  tmp[1] = __acpp_sscp_sub_group_select_i32(tmp[1], id);
+  __acpp_int64 result =
+      (static_cast<__acpp_int64>(tmp[1]) << 32ull) | (static_cast<__acpp_uint32>(tmp[0]));
+  return result;
+}
diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt
index b35e7f4dc..c2e67b9d5 100644
--- a/src/runtime/CMakeLists.txt
+++ b/src/runtime/CMakeLists.txt
@@ -23,6 +23,8 @@ set(HIPSYCL_RT_EXTRA_LINKER_FLAGS ${HIPSYCL_RT_EXTRA_LINKER_FLAGS} ${HIPSYCL_STD
 set(CMAKE_INSTALL_RPATH ${base} ${base}/hipSYCL)
 
 add_library(acpp-rt SHARED
+  allocator.cpp
+  allocation_tracker.cpp
   application.cpp
   runtime.cpp
   error.cpp
@@ -36,6 +38,7 @@ add_library(acpp-rt SHARED
   kernel_cache.cpp
   kernel_configuration.cpp
   multi_queue_executor.cpp
+  runtime_event_handlers.cpp
   dag.cpp
   dag_node.cpp
   dag_builder.cpp
diff --git a/src/runtime/adaptivity_engine.cpp b/src/runtime/adaptivity_engine.cpp
index fbde51a46..c630c23a4 100644
--- a/src/runtime/adaptivity_engine.cpp
+++ b/src/runtime/adaptivity_engine.cpp
@@ -12,10 +12,13 @@
 #include "hipSYCL/common/appdb.hpp"
 #include "hipSYCL/glue/llvm-sscp/fcall_specialization.hpp"
+#include "hipSYCL/runtime/allocation_tracker.hpp"
 #include "hipSYCL/runtime/kernel_configuration.hpp"
 #include "hipSYCL/glue/llvm-sscp/jit.hpp"
 #include "hipSYCL/runtime/application.hpp"
 #include "hipSYCL/common/filesystem.hpp"
+#include "hipSYCL/runtime/runtime_event_handlers.hpp"
+#include <cstdint>
 
 #include <cstring>
 
@@ -151,6 +154,30 @@ bool is_likely_invariant_argument(common::db::kernel_entry &kernel_entry,
   return false;
 }
+
+int determine_ptr_alignment(uint64_t ptrval) {
+  if(ptrval == 0)
+    return 0;
+
+#if defined(__GNUC__) && !defined(__llvm__) && !defined(__INTEL_COMPILER) && \
+    !defined(__NVCOMPILER)
+  // gcc supports __builtin_ctz, but versions prior to 10
+  // do not support __has_builtin
+  #define ACPP_HAS_BUILTIN_CTZ
+#else
+  #if __has_builtin(__builtin_ctzll)
+    #define ACPP_HAS_BUILTIN_CTZ
+  #endif
+#endif
+
+#ifdef ACPP_HAS_BUILTIN_CTZ
+  uint64_t alignment = 1ull << __builtin_ctzll(ptrval);
+  return alignment >= 32 ? 32 : 0;
+#else
+  return 0;
+#endif
+}
+
 }
 
 kernel_adaptivity_engine::kernel_adaptivity_engine(
@@ -220,10 +247,95 @@ kernel_adaptivity_engine::finalize_binary_configuration(
       std::memcpy(&buffer_value, _arg_mapper.get_mapped_args()[i], arg_size);
       config.set_specialized_kernel_argument(i, buffer_value);
     }
+
+    if (_kernel_info->get_argument_type(i) ==
+        hcf_kernel_info::argument_type::pointer) {
+      if (has_annotation(_kernel_info, i,
+                         hcf_kernel_info::annotation_type::noalias)) {
+        config.set_kernel_param_flag(i, kernel_param_flag::noalias);
+      }
+    }
+  }
+
+  // Handle auto alignment specialization
+  for(int i = 0; i < _kernel_info->get_num_parameters(); ++i) {
+    std::size_t arg_size = _kernel_info->get_argument_size(i);
+    if (_kernel_info->get_argument_type(i) == hcf_kernel_info::argument_type::pointer) {
+      uint64_t buffer = 0;
+      std::memcpy(&buffer, _arg_mapper.get_mapped_args()[i],
+                  _kernel_info->get_argument_size(i));
+
+      int alignment = determine_ptr_alignment(buffer);
+      if(alignment > 0) {
+        HIPSYCL_DEBUG_INFO
+            << "adaptivity_engine: Inferred pointer alignment of "
+            << alignment << " for kernel argument " << i << std::endl;
+        config.set_known_alignment(i, alignment);
+      }
+    }
+  }
+
+  if(application::get_settings().get<setting::enable_allocation_tracking>()) {
+    // Detect whether pointer arguments qualify for NoAlias/restrict semantics.
+    // This is achieved by determining the base of the allocations for all pointer
+    // kernel arguments, and checking whether there are other pointer arguments
+    // from the same allocation.
+    constexpr int max_allocations = 32;
+    uint64_t allocation_base_addresses [max_allocations] = {};
+    bool allocations_exceeded = false;
+    for(int alloc_index = 0, i = 0; i < _kernel_info->get_num_parameters(); ++i) {
+      if(_kernel_info->get_argument_type(i) == hcf_kernel_info::argument_type::pointer) {
+        auto arg_size = _kernel_info->get_argument_size(i);
+        if(arg_size == sizeof(void*)) {
+          void* ptr_arg;
+          std::memcpy(&ptr_arg, _arg_mapper.get_mapped_args()[i], arg_size);
+          if (ptr_arg) {
+            allocation_info ainfo;
+            uint64_t allocation_base;
+            if(allocation_tracker::query_allocation(ptr_arg, ainfo, allocation_base)) {
+              allocation_base_addresses[alloc_index] = allocation_base;
+            }
+          }
+        }
+        ++alloc_index;
+        if (alloc_index >= max_allocations) {
+          allocations_exceeded = true;
+          break;
+        }
+      }
+    }
+    if (!allocations_exceeded) {
+      for (int alloc_index = 0, i = 0; i < _kernel_info->get_num_parameters();
+           ++i) {
+        if (_kernel_info->get_argument_type(i) ==
+            hcf_kernel_info::argument_type::pointer) {
+          if (allocation_base_addresses[alloc_index] != 0) {
+            bool argument_might_alias = false;
+            for (int k = 0; k < max_allocations; ++k) {
+              if (k != alloc_index) {
+                if (allocation_base_addresses[alloc_index] ==
+                    allocation_base_addresses[k]) {
+                  argument_might_alias = true;
+                  break;
+                }
+              }
+            }
+            if (!argument_might_alias) {
+              HIPSYCL_DEBUG_INFO << "adaptivity_engine: Inferred noalias "
+                                    "pointer semantics for kernel argument "
+                                 << i << std::endl;
+              config.set_kernel_param_flag(i, kernel_param_flag::noalias);
+            }
+          }
+          ++alloc_index;
+        }
+      }
+    }
+  }
 }
 
 if(_adaptivity_level > 1) {
+  auto base_id = config.generate_id();
 
   // Automatic application of specialization constants by detecting
@@ -288,5 +400,6 @@ std::string kernel_adaptivity_engine::select_image_and_kernels(
     return glue::jit::select_image(_kernel_info, kernel_names_out);
   }
 }
+
 }
 }
\ No newline at end of file
diff --git a/src/runtime/allocation_tracker.cpp b/src/runtime/allocation_tracker.cpp
new file mode 100644
index 000000000..43e3dfee8
--- /dev/null
+++ b/src/runtime/allocation_tracker.cpp
@@ -0,0 +1,46 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/runtime/allocation_tracker.hpp"
+
+
+namespace hipsycl::rt {
+
+namespace {
+
+using amap_t = common::allocation_map<allocation_info>;
+
+amap_t& get_allocation_map() {
+  static amap_t amap;
+  return amap;
+}
+
+}
+
+bool allocation_tracker::register_allocation(const void *ptr, std::size_t size,
+                                             const allocation_info &info) {
+  using value_type = amap_t::value_type;
+
+  value_type v;
+  v.allocation_info::operator=(info);
+  v.allocation_size = size;
+  return get_allocation_map().insert(reinterpret_cast<uint64_t>(ptr), v);
+}
+
+bool allocation_tracker::unregister_allocation(const void* ptr) {
+  return get_allocation_map().erase(reinterpret_cast<uint64_t>(ptr));
+}
+
+bool allocation_tracker::query_allocation(const void *ptr, allocation_info &out,
+                                          uint64_t &root_address) {
+  auto *entry =
+      get_allocation_map().get_entry(reinterpret_cast<uint64_t>(ptr), root_address);
+  if(!entry)
+    return false;
+  out = *entry;
+  return true;
+}
+}
diff --git a/src/runtime/allocator.cpp b/src/runtime/allocator.cpp
new file mode 100644
index 000000000..087650bba
--- /dev/null
+++ b/src/runtime/allocator.cpp
@@ -0,0 +1,61 @@
+/*
+ * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard
+ * parallelism for CPUs and GPUs.
+ *
+ * Copyright The AdaptiveCpp Contributors
+ *
+ * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License.
+ * See file LICENSE in the project root for full license details.
+ */
+// SPDX-License-Identifier: BSD-2-Clause
+
+#include "hipSYCL/runtime/allocator.hpp"
+#include "hipSYCL/runtime/allocation_tracker.hpp"
+#include "hipSYCL/runtime/application.hpp"
+#include "hipSYCL/runtime/runtime_event_handlers.hpp"
+
+namespace hipsycl {
+namespace rt {
+
+void *allocate_device(backend_allocator *alloc, size_t min_alignment,
+                      size_t size_bytes) {
+  auto *ptr = alloc->raw_allocate(min_alignment, size_bytes);
+  if(ptr) {
+    application::event_handler_layer().on_new_allocation(
+        ptr, size_bytes,
+        allocation_info{alloc->get_device(),
+                        allocation_info::allocation_type::device});
+  }
+  return ptr;
+}
+
+void *allocate_host(backend_allocator *alloc, size_t min_alignment,
+                    size_t bytes) {
+  auto* ptr = alloc->raw_allocate_optimized_host(min_alignment, bytes);
+  if(ptr) {
+    application::event_handler_layer().on_new_allocation(
+        ptr, bytes,
+        allocation_info{alloc->get_device(),
+                        allocation_info::allocation_type::host});
+  }
+  return ptr;
+}
+
+void *allocate_shared(backend_allocator *alloc, size_t bytes) {
+  auto* ptr = alloc->raw_allocate_usm(bytes);
+  if(ptr) {
+    application::event_handler_layer().on_new_allocation(
+        ptr, bytes,
+        allocation_info{alloc->get_device(),
+                        allocation_info::allocation_type::shared});
+  }
+  return ptr;
+}
+
+void deallocate(backend_allocator* alloc, void *mem) {
+  alloc->raw_free(mem);
+  application::event_handler_layer().on_deallocation(mem);
+}
+
+}
+}
diff --git a/src/runtime/application.cpp b/src/runtime/application.cpp
index f9ced33c0..1b7cde5b7 100644
--- a/src/runtime/application.cpp
+++ b/src/runtime/application.cpp
@@ -56,6 +56,11 @@ async_error_list& application::errors() {
   return errors;
 }
 
+runtime_event_handlers& application::event_handler_layer() {
+  static runtime_event_handlers h;
+  return h;
+}
+
 }
 }
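Taken together, the pieces above make every successful USM allocation observable by the JIT layer: the allocate_* entry points report to application::event_handler_layer(), which records the allocation via allocation_tracker, and the adaptivity engine later maps kernel pointer arguments back to their allocation roots to infer noalias. A minimal sketch of a tracker query, assuming a pointer previously registered through one of these paths (usm_ptr is a hypothetical placeholder, not part of this diff):

    // Sketch: recover the allocation that contains an arbitrary USM pointer.
    hipsycl::rt::allocation_info info;
    uint64_t root = 0;
    if (hipsycl::rt::allocation_tracker::query_allocation(usm_ptr, info, root)) {
      // root is the base address of the containing allocation; two kernel
      // pointer arguments with different roots cannot alias each other.
    }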
diff --git a/src/runtime/cuda/cuda_allocator.cpp b/src/runtime/cuda/cuda_allocator.cpp index 97c4f8a0a..4eeacec09 100644 --- a/src/runtime/cuda/cuda_allocator.cpp +++ b/src/runtime/cuda/cuda_allocator.cpp @@ -21,7 +21,7 @@ cuda_allocator::cuda_allocator(backend_descriptor desc, int cuda_device) : _backend_descriptor{desc}, _dev{cuda_device} {} -void *cuda_allocator::allocate(size_t min_alignment, size_t size_bytes) +void *cuda_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { void *ptr; cuda_device_manager::get().activate_device(_dev); @@ -38,8 +38,8 @@ void *cuda_allocator::allocate(size_t min_alignment, size_t size_bytes) return ptr; } -void *cuda_allocator::allocate_optimized_host(size_t min_alignment, - size_t bytes) { +void *cuda_allocator::raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) { void *ptr; cuda_device_manager::get().activate_device(_dev); @@ -55,7 +55,7 @@ void *cuda_allocator::allocate_optimized_host(size_t min_alignment, return ptr; } -void cuda_allocator::free(void *mem) { +void cuda_allocator::raw_free(void *mem) { pointer_info info; result query_result = query_pointer(mem, info); @@ -79,7 +79,7 @@ void cuda_allocator::free(void *mem) { } } -void * cuda_allocator::allocate_usm(size_t bytes) +void * cuda_allocator::raw_allocate_usm(size_t bytes) { cuda_device_manager::get().activate_device(_dev); @@ -157,5 +157,9 @@ result cuda_allocator::mem_advise(const void *addr, std::size_t num_bytes, return make_success(); } +device_id cuda_allocator::get_device() const { + return device_id{_backend_descriptor, _dev}; +} + } } diff --git a/src/runtime/cuda/cuda_hardware_manager.cpp b/src/runtime/cuda/cuda_hardware_manager.cpp index dd00c562e..83a073c87 100644 --- a/src/runtime/cuda/cuda_hardware_manager.cpp +++ b/src/runtime/cuda/cuda_hardware_manager.cpp @@ -85,6 +85,13 @@ device_id cuda_hardware_manager::get_device_id(std::size_t index) const { static_cast<int>(index)}; } +std::size_t cuda_hardware_manager::get_num_platforms() const { + return 1; +} + +std::size_t cuda_hardware_context::get_platform_index() const { + return 0; +} cuda_hardware_context::cuda_hardware_context(int dev) : _dev{dev} { @@ -359,6 +366,12 @@ cuda_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return 4318; break; + case device_uint_property::architecture: + return _properties->major * 10 + _properties->minor; + break; + case device_uint_property::backend_id: + return static_cast<std::size_t>(backend_id::cuda); + break; } assert(false && "Invalid device property"); std::terminate(); diff --git a/src/runtime/cuda/cuda_queue.cpp b/src/runtime/cuda/cuda_queue.cpp index 07fa5ba10..6e7726edb 100644 --- a/src/runtime/cuda/cuda_queue.cpp +++ b/src/runtime/cuda/cuda_queue.cpp @@ -176,6 +176,9 @@ cuda_queue::cuda_queue(cuda_backend *be, device_id dev, int priority) _kernel_cache{kernel_cache::get()} { this->activate_device(); + _reflection_map = glue::jit::construct_default_reflection_map( + be->get_hardware_manager()->get_device(dev.get_id())); + cudaError_t err; if(priority == 0) { err = cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking); @@ -644,10 +647,11 @@ result cuda_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { - err = glue::jit::compile(translator.get(), - hcf_object, 
selected_image_name, _config, compiled_image); + err = + glue::jit::compile(translator.get(), hcf_object, selected_image_name, + _config, _reflection_map, compiled_image); } if(!err.is_success()) { diff --git a/src/runtime/dag_direct_scheduler.cpp b/src/runtime/dag_direct_scheduler.cpp index 72196273b..ce7d6e9c3 100644 --- a/src/runtime/dag_direct_scheduler.cpp +++ b/src/runtime/dag_direct_scheduler.cpp @@ -91,7 +91,7 @@ result ensure_allocation_exists(runtime *rt, // cause backends to align to the largest supported type. // TODO: A better solution might be to select a custom alignment // based on sizeof(T). This requires querying backend alignment capabilities. - void *ptr = allocator->allocate(0, num_bytes); + void *ptr = rt::allocate_device(allocator, 0, num_bytes); if(!ptr) return register_error( diff --git a/src/runtime/hip/hip_allocator.cpp b/src/runtime/hip/hip_allocator.cpp index 15fffa047..13335d01c 100644 --- a/src/runtime/hip/hip_allocator.cpp +++ b/src/runtime/hip/hip_allocator.cpp @@ -20,7 +20,7 @@ hip_allocator::hip_allocator(backend_descriptor desc, int hip_device) : _backend_descriptor{desc}, _dev{hip_device} {} -void *hip_allocator::allocate(size_t min_alignment, size_t size_bytes) +void *hip_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { void *ptr; hip_device_manager::get().activate_device(_dev); @@ -37,8 +37,8 @@ void *hip_allocator::allocate(size_t min_alignment, size_t size_bytes) return ptr; } -void *hip_allocator::allocate_optimized_host(size_t min_alignment, - size_t bytes) { +void *hip_allocator::raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) { void *ptr; hip_device_manager::get().activate_device(_dev); @@ -54,7 +54,7 @@ void *hip_allocator::allocate_optimized_host(size_t min_alignment, return ptr; } -void hip_allocator::free(void *mem) { +void hip_allocator::raw_free(void *mem) { pointer_info info; result query_result = query_pointer(mem, info); @@ -78,7 +78,7 @@ void hip_allocator::free(void *mem) { } } -void * hip_allocator::allocate_usm(size_t bytes) +void * hip_allocator::raw_allocate_usm(size_t bytes) { hip_device_manager::get().activate_device(_dev); @@ -173,5 +173,9 @@ result hip_allocator::mem_advise(const void *addr, std::size_t num_bytes, return make_success(); } +device_id hip_allocator::get_device() const { + return device_id{_backend_descriptor, _dev}; +} + } } diff --git a/src/runtime/hip/hip_hardware_manager.cpp b/src/runtime/hip/hip_hardware_manager.cpp index 5455703e1..7ced6676e 100644 --- a/src/runtime/hip/hip_hardware_manager.cpp +++ b/src/runtime/hip/hip_hardware_manager.cpp @@ -17,10 +17,37 @@ #include #include #include +#include namespace hipsycl { namespace rt { +namespace { + + +int device_arch_string_to_int(const std::string& device_name) { + std::string prefix = "gfx"; + + if(device_name.find(prefix) != 0) + return 0; + + std::string substr = device_name; + substr.erase(0, prefix.length()); + + auto colon_pos = substr.find(":"); + if(colon_pos != std::string::npos) { + substr.erase(colon_pos); + } + + for(int i = 0; i < substr.length(); ++i) { + if(!std::isxdigit(substr[i])) + return 0; + } + return std::stoi(substr, nullptr, 16); +} + +} + hip_hardware_manager::hip_hardware_manager(hardware_platform hw_platform) : _hw_platform(hw_platform) { @@ -81,6 +108,13 @@ device_id hip_hardware_manager::get_device_id(std::size_t index) const { static_cast<int>(index)}; } +std::size_t hip_hardware_manager::get_num_platforms() const { + return 1; +} + +std::size_t hip_hardware_context::get_platform_index() const { + 
return 0; +} hip_hardware_context::hip_hardware_context(int dev) : _dev{dev} { _properties = std::make_unique(); @@ -97,6 +131,8 @@ hip_hardware_context::hip_hardware_context(int dev) : _dev{dev} { _allocator = std::make_unique( backend_descriptor{hardware_platform::rocm, api_platform::hip}, _dev); _event_pool = std::make_unique(_dev); + + _numeric_architecture = device_arch_string_to_int(get_device_arch()); } hip_allocator* hip_hardware_context::get_allocator() const { @@ -366,6 +402,11 @@ hip_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return 1022; break; + case device_uint_property::architecture: + return _numeric_architecture; + case device_uint_property::backend_id: + return static_cast<std::size_t>(backend_id::hip); + break; } assert(false && "Invalid device property"); std::terminate(); diff --git a/src/runtime/hip/hip_queue.cpp b/src/runtime/hip/hip_queue.cpp index dd513ddc4..8d0e8f16a 100644 --- a/src/runtime/hip/hip_queue.cpp +++ b/src/runtime/hip/hip_queue.cpp @@ -167,6 +167,9 @@ hip_queue::hip_queue(hip_backend *be, device_id dev, int priority) _kernel_cache{kernel_cache::get()} { this->activate_device(); + _reflection_map = glue::jit::construct_default_reflection_map( + be->get_hardware_manager()->get_device(dev.get_id())); + hipError_t err; if(priority == 0) { err = hipStreamCreateWithFlags(&_stream, hipStreamNonBlocking); @@ -642,10 +645,11 @@ result hip_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { - err = glue::jit::compile(translator.get(), - hcf_object, selected_image_name, _config, compiled_image); + err = + glue::jit::compile(translator.get(), hcf_object, selected_image_name, + _config, _reflection_map, compiled_image); } if(!err.is_success()) { diff --git a/src/runtime/kernel_cache.cpp b/src/runtime/kernel_cache.cpp index 748b620c6..c90c0dd22 100644 --- a/src/runtime/kernel_cache.cpp +++ b/src/runtime/kernel_cache.cpp @@ -104,6 +104,8 @@ hcf_kernel_info::hcf_kernel_info( } else if(entry.first == "fcall_specialized_config") { _known_annotations.back().push_back( annotation_type::fcall_specialized_config); + } else if(entry.first == "restrict") { + _known_annotations.back().push_back(annotation_type::noalias); } else { _string_annotations.back().push_back(entry.first); } diff --git a/src/runtime/ocl/ocl_allocator.cpp b/src/runtime/ocl/ocl_allocator.cpp index 35a3a338c..3312d85d1 100644 --- a/src/runtime/ocl/ocl_allocator.cpp +++ b/src/runtime/ocl/ocl_allocator.cpp @@ -17,10 +17,10 @@ namespace hipsycl { namespace rt { -ocl_allocator::ocl_allocator(ocl_usm* usm) -: _usm{usm} {} +ocl_allocator::ocl_allocator(rt::device_id dev, ocl_usm* usm) +: _dev{dev}, _usm{usm} {} -void* ocl_allocator::allocate(size_t min_alignment, size_t size_bytes) { +void* ocl_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { if(!_usm->is_available()) { register_error(__acpp_here(), error_info{"ocl_allocator: OpenCL device does not have valid USM provider", @@ -40,7 +40,7 @@ void* ocl_allocator::allocate(size_t min_alignment, size_t size_bytes) { return ptr; } -void *ocl_allocator::allocate_optimized_host(size_t min_alignment, +void *ocl_allocator::raw_allocate_optimized_host(size_t min_alignment, size_t bytes) { if(!_usm->is_available()) { register_error(__acpp_here(), @@ 
-60,7 +60,7 @@ void *ocl_allocator::allocate_optimized_host(size_t min_alignment, return ptr; } -void ocl_allocator::free(void *mem) { +void ocl_allocator::raw_free(void *mem) { if(!_usm->is_available()) { register_error(__acpp_here(), error_info{"ocl_allocator: OpenCL device does not have valid USM provider", @@ -76,7 +76,7 @@ void ocl_allocator::free(void *mem) { } } -void *ocl_allocator::allocate_usm(size_t bytes) { +void *ocl_allocator::raw_allocate_usm(size_t bytes) { if(!_usm->is_available()) { register_error(__acpp_here(), error_info{"ocl_allocator: OpenCL device does not have valid USM provider", @@ -103,6 +103,10 @@ bool ocl_allocator::is_usm_accessible_from(backend_descriptor b) const { return b.hw_platform == hardware_platform::ocl; } +device_id ocl_allocator::get_device() const { + return _dev; +} + result ocl_allocator::query_pointer(const void* ptr, pointer_info& out) const { if(!_usm->is_available()) { auto err = make_error(__acpp_here(), diff --git a/src/runtime/ocl/ocl_code_object.cpp b/src/runtime/ocl/ocl_code_object.cpp index c0398eb43..ad352186f 100644 --- a/src/runtime/ocl/ocl_code_object.cpp +++ b/src/runtime/ocl/ocl_code_object.cpp @@ -54,7 +54,7 @@ ocl_executable_object::ocl_executable_object(const cl::Context& ctx, cl::Device& std::string options_string="-cl-uniform-work-group-size"; for(const auto& flag : config.build_flags()) { if(flag == kernel_build_flag::fast_math) { - options_string += " -cl-fast-relaxed-math"; + options_string += " -cl-fast-relaxed-math -cl-denorms-are-zero"; } } diff --git a/src/runtime/ocl/ocl_hardware_manager.cpp b/src/runtime/ocl/ocl_hardware_manager.cpp index b6c147154..3cf9624b3 100644 --- a/src/runtime/ocl/ocl_hardware_manager.cpp +++ b/src/runtime/ocl/ocl_hardware_manager.cpp @@ -152,7 +152,7 @@ bool should_include_device(const std::string& dev_name, const cl::Device& dev) { info_query<CL_DEVICE_SVM_CAPABILITIES>(dev); bool has_usm_extension = info_query<CL_DEVICE_EXTENSIONS>(dev).find("cl_intel_unified_shared_memory") != std::string::npos; - bool has_system_svm = !(cap & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM); + bool has_system_svm = cap & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM; if(!has_usm_extension && !has_system_svm) { HIPSYCL_DEBUG_WARNING << "ocl_hardware_manager: OpenCL device '" << dev_name @@ -469,6 +469,13 @@ std::size_t ocl_hardware_context::get_property(device_uint_property prop) const return static_cast( info_query(_dev)); break; + case device_uint_property::architecture: + // TODO + return 0; + break; + case device_uint_property::backend_id: + return static_cast<std::size_t>(backend_id::ocl); + break; } assert(false && "Invalid device property"); std::terminate(); @@ -561,7 +568,18 @@ void ocl_hardware_context::init_allocator(ocl_hardware_manager *mgr) { "allocations are not possible on that device." 
<< std::endl; } - _alloc = ocl_allocator{_usm_provider.get()}; + device_id dev{ + backend_descriptor{hardware_platform::ocl, api_platform::ocl}, + _dev_id}; + _alloc = ocl_allocator{dev, _usm_provider.get()}; +} + +std::size_t ocl_hardware_context::get_platform_index() const { + return static_cast<std::size_t>(_platform_id); +} + +std::size_t ocl_hardware_manager::get_num_platforms() const { + return _platforms.size(); } ocl_hardware_manager::ocl_hardware_manager() diff --git a/src/runtime/ocl/ocl_queue.cpp b/src/runtime/ocl/ocl_queue.cpp index 25690afb5..e3dea185a 100644 --- a/src/runtime/ocl/ocl_queue.cpp +++ b/src/runtime/ocl/ocl_queue.cpp @@ -125,6 +125,8 @@ ocl_queue::ocl_queue(ocl_hardware_manager* hw_manager, std::size_t device_index) error_info{"ocl_queue: Couldn't construct backend queue", error_code{"CL", err}}); } + + _reflection_map = glue::jit::construct_default_reflection_map(dev_ctx); } ocl_queue::~ocl_queue() {} @@ -517,10 +519,11 @@ result ocl_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { - err = glue::jit::compile(translator.get(), - hcf_object, selected_image_name, _config, compiled_image); + err = + glue::jit::compile(translator.get(), hcf_object, selected_image_name, + _config, _reflection_map, compiled_image); } if(!err.is_success()) {
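Before the omp_allocator hunk that follows, a request whose size was not a multiple of the requested alignment simply failed with nullptr; the reworked raw_allocate() normalizes both values instead. A short walk-through of the new logic (the request values are made up):

  // raw_allocate(0, 1000) on the OpenMP backend:
  //   min_alignment (0) < 32   -> retried as raw_allocate(32, 1000)
  //   1000 % 32 == 8           -> retried as raw_allocate(32, 1024)
  //   1024 % 32 == 0           -> handed to the platform's aligned
  //                               allocation routine
  //
  // The 32-byte floor is the same threshold the adaptivity engine treats as
  // strongly aligned, so host allocations now qualify for the alignment
  // specialization added in adaptivity_engine above.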
diff --git a/src/runtime/omp/omp_allocator.cpp b/src/runtime/omp/omp_allocator.cpp index 10ee049ca..d866fccd3 100644 --- a/src/runtime/omp/omp_allocator.cpp +++ b/src/runtime/omp/omp_allocator.cpp @@ -21,7 +21,14 @@ namespace rt { omp_allocator::omp_allocator(const device_id &my_device) : _my_device{my_device} {} -void *omp_allocator::allocate(size_t min_alignment, size_t size_bytes) { +void *omp_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { + if(min_alignment < 32) { + // Enforce alignment by default for performance reasons. + // 32 is chosen since this is what is currently needed by the adaptivity + // engine to consider an allocation strongly aligned. + return raw_allocate(32, size_bytes); + } + #if !defined(_WIN32) // posix requires alignment to be a multiple of sizeof(void*) if (min_alignment < sizeof(void*)) @@ -35,11 +42,12 @@ void *omp_allocator::allocate(size_t min_alignment, size_t size_bytes) { min_alignment = 1; #endif - if(size_bytes % min_alignment != 0) - return nullptr; + if(min_alignment > 0 && size_bytes % min_alignment != 0) + return raw_allocate(min_alignment, + next_multiple_of(size_bytes, min_alignment)); - // ToDo: Mac OS CI has a problem with std::aligned_alloc - // but it's unclear if it's a Mac, or libc++, or toolchain issue + // ToDo: Mac OS CI has a problem with std::aligned_alloc + // but it's unclear if it's a Mac, or libc++, or toolchain issue #ifdef __APPLE__ return aligned_alloc(min_alignment, size_bytes); #elif !defined(_WIN32) @@ -50,12 +58,12 @@ void *omp_allocator::allocate(size_t min_alignment, size_t size_bytes) { #endif } -void *omp_allocator::allocate_optimized_host(size_t min_alignment, +void *omp_allocator::raw_allocate_optimized_host(size_t min_alignment, size_t bytes) { - return this->allocate(min_alignment, bytes); + return this->raw_allocate(min_alignment, bytes); }; -void omp_allocator::free(void *mem) { +void omp_allocator::raw_free(void *mem) { #if !defined(_WIN32) std::free(mem); #else @@ -63,8 +71,8 @@ void omp_allocator::free(void *mem) { #endif } -void* omp_allocator::allocate_usm(size_t bytes) { - return this->allocate(0, bytes); +void* omp_allocator::raw_allocate_usm(size_t bytes) { + return this->raw_allocate(0, bytes); } bool omp_allocator::is_usm_accessible_from(backend_descriptor b) const { @@ -74,6 +82,10 @@ bool omp_allocator::is_usm_accessible_from(backend_descriptor b) const { return false; } +device_id omp_allocator::get_device() const { + return _my_device; +} + result omp_allocator::query_pointer(const void *ptr, pointer_info &out) const { // For a host device, USM is the same as host memory? diff --git a/src/runtime/omp/omp_backend.cpp b/src/runtime/omp/omp_backend.cpp index 615763aec..c88b85fef 100644 --- a/src/runtime/omp/omp_backend.cpp +++ b/src/runtime/omp/omp_backend.cpp @@ -38,14 +38,14 @@ namespace rt { namespace { -std::unique_ptr<inorder_queue> make_omp_queue(device_id dev) { - return std::make_unique<omp_queue>(dev.get_backend()); +std::unique_ptr<inorder_queue> make_omp_queue(omp_backend* be, device_id dev) { + return std::make_unique<omp_queue>(be, dev.get_id()); } std::unique_ptr<multi_queue_executor> create_multi_queue_executor(omp_backend *b) { - return std::make_unique<multi_queue_executor>(*b, [](device_id dev) { - return make_omp_queue(dev); + return std::make_unique<multi_queue_executor>(*b, [b](device_id dev) { + return make_omp_queue(b, dev); }); } diff --git a/src/runtime/omp/omp_hardware_manager.cpp b/src/runtime/omp/omp_hardware_manager.cpp index 0adb67ce9..201d29ad6 100644 --- a/src/runtime/omp/omp_hardware_manager.cpp +++ b/src/runtime/omp/omp_hardware_manager.cpp @@ -37,11 +37,11 @@ std::size_t omp_hardware_context::get_max_memcpy_concurrency() const { } std::string omp_hardware_context::get_device_name() const { - return "hipSYCL OpenMP host device"; + return "AdaptiveCpp OpenMP host device"; } std::string omp_hardware_context::get_vendor_name() const { - return "the hipSYCL project"; + return "the AdaptiveCpp project"; } std::string omp_hardware_context::get_device_arch() const { @@ -122,6 +122,8 @@ std::size_t omp_hardware_context::get_property(device_uint_property prop) const { switch (prop) { case device_uint_property::max_compute_units: + // Do not change this; heuristics in algorithms library + // use this. 
return omp_get_num_procs(); break; case device_uint_property::max_global_size0: @@ -265,6 +267,13 @@ omp_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return std::numeric_limits<std::size_t>::max(); break; + case device_uint_property::architecture: + // TODO + return 0; + break; + case device_uint_property::backend_id: + return static_cast<std::size_t>(backend_id::omp); + break; } assert(false && "Invalid device property"); return 0; @@ -287,6 +296,16 @@ std::string omp_hardware_context::get_profile() const { return "FULL_PROFILE"; } +std::size_t omp_hardware_context::get_platform_index() const { + return 0; +} + +std::size_t omp_hardware_manager::get_num_platforms() const { + return 1; +} + + + std::size_t omp_hardware_manager::get_num_devices() const { return 1; } diff --git a/src/runtime/omp/omp_queue.cpp b/src/runtime/omp/omp_queue.cpp index 1a65f54dd..9a789e4c6 100644 --- a/src/runtime/omp/omp_queue.cpp +++ b/src/runtime/omp/omp_queue.cpp @@ -21,6 +21,7 @@ #include "hipSYCL/runtime/instrumentation.hpp" #include "hipSYCL/runtime/kernel_launcher.hpp" #include "hipSYCL/runtime/omp/omp_event.hpp" +#include "hipSYCL/runtime/omp/omp_backend.hpp" #include "hipSYCL/runtime/operations.hpp" #include "hipSYCL/runtime/queue_completion_event.hpp" #include "hipSYCL/runtime/signal_channel.hpp" @@ -185,14 +186,36 @@ std::size_t get_page_size() { #endif } +void *resize_and_align(std::vector<char> &data, std::size_t size, + std::size_t alignment) { + data.resize(size + alignment); + return reinterpret_cast<void *>( + next_multiple_of(reinterpret_cast<uint64_t>(data.data()), + alignment)); +} + +void *resize_and_strongly_align(std::vector<char> &data, std::size_t size) { + // compiler/libkernel builtins assume alignment of at least + // 512 byte boundaries + std::size_t alignment = std::max(std::size_t{512}, get_page_size()); + return resize_and_align(data, size, alignment); +} + result launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, const rt::range<3> &num_groups, const rt::range<3> &local_size, unsigned shared_memory, void **kernel_args) { if (num_groups.size() == 1 && shared_memory == 0) { + // still need to be able to support group algorithms + // make thread-local in case we have multiple threads submitting. 
+ static thread_local std::vector<char> internal_local_memory; + auto aligned_internal_local_memory = resize_and_strongly_align( + internal_local_memory, local_size.size() * sizeof(uint64_t)); + omp_sscp_executable_object::work_group_info info{ - num_groups, rt::id<3>{0, 0, 0}, local_size, nullptr}; + num_groups, rt::id<3>{0, 0, 0}, local_size, nullptr, + aligned_internal_local_memory}; kernel(&info, kernel_args); return make_success(); } @@ -209,11 +232,11 @@ launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, { // get page aligned local memory from heap static thread_local std::vector<char> local_memory; - - const auto page_size = get_page_size(); - local_memory.resize(shared_memory + page_size); - auto aligned_local_memory = reinterpret_cast<void *>(next_multiple_of(reinterpret_cast<uint64_t>(local_memory.data()), page_size)); - + static thread_local std::vector<char> internal_local_memory; + auto aligned_local_memory = + resize_and_strongly_align(local_memory, shared_memory); + auto aligned_internal_local_memory = resize_and_strongly_align( + internal_local_memory, local_size.size() * sizeof(uint64_t)); #ifdef _OPENMP #pragma omp for collapse(3) #endif @@ -221,7 +244,8 @@ launch_kernel_from_so(omp_sscp_executable_object::omp_sscp_kernel *kernel, for (std::size_t j = 0; j < num_groups.get(1); ++j) { for (std::size_t i = 0; i < num_groups.get(0); ++i) { omp_sscp_executable_object::work_group_info info{ - num_groups, rt::id<3>{i, j, k}, local_size, aligned_local_memory}; + num_groups, rt::id<3>{i, j, k}, local_size, aligned_local_memory, + aligned_internal_local_memory}; kernel(&info, kernel_args); } } @@ -232,9 +256,12 @@ #endif } // namespace -omp_queue::omp_queue(backend_id id) - : _backend_id(id), _sscp_code_object_invoker{this}, - _kernel_cache{kernel_cache::get()} {} +omp_queue::omp_queue(omp_backend* be, int dev) + : _backend_id{be->get_unique_backend_id()}, _sscp_code_object_invoker{this}, + _kernel_cache{kernel_cache::get()} { + _reflection_map = glue::jit::construct_default_reflection_map( + be->get_hardware_manager()->get_device(dev)); +} omp_queue::~omp_queue() { _worker.halt(); } @@ -259,96 +286,94 @@ result omp_queue::submit_memcpy(memcpy_operation &op, const dag_node_ptr& node) HIPSYCL_DEBUG_INFO << "omp_queue: Submitting memcpy operation..." 
<< std::endl; - if (op.source().get_device().is_host() && op.dest().get_device().is_host()) { - - void *base_src = op.source().get_base_ptr(); - void *base_dest = op.dest().get_base_ptr(); + if (!op.source().get_device().is_host() || !op.dest().get_device().is_host()) { + return register_error( + __acpp_here(), + error_info{"omp_queue: OpenMP CPU backend cannot transfer data between " + "host and accelerators.", + error_type::feature_not_supported}); + } - assert(base_src); - assert(base_dest); + void *base_src = op.source().get_base_ptr(); + void *base_dest = op.dest().get_base_ptr(); - range<3> transferred_range = op.get_num_transferred_elements(); - range<3> src_allocation_shape = op.source().get_allocation_shape(); - range<3> dest_allocation_shape = op.dest().get_allocation_shape(); - id<3> src_offset = op.source().get_access_offset(); - id<3> dest_offset = op.dest().get_access_offset(); - std::size_t src_element_size = op.source().get_element_size(); - std::size_t dest_element_size = op.dest().get_element_size(); + assert(base_src); + assert(base_dest); - std::size_t total_num_bytes = op.get_num_transferred_bytes(); + range<3> transferred_range = op.get_num_transferred_elements(); + range<3> src_allocation_shape = op.source().get_allocation_shape(); + range<3> dest_allocation_shape = op.dest().get_allocation_shape(); + id<3> src_offset = op.source().get_access_offset(); + id<3> dest_offset = op.dest().get_access_offset(); + std::size_t src_element_size = op.source().get_element_size(); + std::size_t dest_element_size = op.dest().get_element_size(); - bool is_src_contiguous = - is_contigous(src_offset, transferred_range, src_allocation_shape); - bool is_dest_contiguous = - is_contigous(dest_offset, transferred_range, dest_allocation_shape); + std::size_t total_num_bytes = op.get_num_transferred_bytes(); - omp_instrumentation_setup instrumentation_setup{op, node}; + bool is_src_contiguous = + is_contigous(src_offset, transferred_range, src_allocation_shape); + bool is_dest_contiguous = + is_contigous(dest_offset, transferred_range, dest_allocation_shape); - _worker([=]() { - auto instrumentation_guard = instrumentation_setup.instrument_task(); + omp_instrumentation_setup instrumentation_setup{op, node}; - auto linear_index = [](id<3> id, range<3> allocation_shape) { - return id[2] + allocation_shape[2] * id[1] + - allocation_shape[2] * allocation_shape[1] * id[0]; - }; + _worker([=]() { + auto instrumentation_guard = instrumentation_setup.instrument_task(); - if (is_src_contiguous && is_dest_contiguous) { - char *current_src = reinterpret_cast<char *>(base_src); - char *current_dest = reinterpret_cast<char *>(base_dest); + auto linear_index = [](id<3> id, range<3> allocation_shape) { + return id[2] + allocation_shape[2] * id[1] + + allocation_shape[2] * allocation_shape[1] * id[0]; + }; - current_src += - linear_index(src_offset, src_allocation_shape) * src_element_size; - current_dest += linear_index(dest_offset, dest_allocation_shape) * - dest_element_size; + if (is_src_contiguous && is_dest_contiguous) { + char *current_src = reinterpret_cast<char *>(base_src); + char *current_dest = reinterpret_cast<char *>(base_dest); - memcpy(current_dest, current_src, total_num_bytes); - } else { - id<3> current_src_offset = src_offset; - id<3> current_dest_offset = dest_offset; - std::size_t row_size = transferred_range[2] * src_element_size; + current_src += + linear_index(src_offset, src_allocation_shape) * src_element_size; + current_dest += + linear_index(dest_offset, dest_allocation_shape) * dest_element_size; - for 
(std::size_t surface = 0; surface < transferred_range[0]; - ++surface) { - for (std::size_t row = 0; row < transferred_range[1]; ++row) { + memcpy(current_dest, current_src, total_num_bytes); + } else { + id<3> current_src_offset = src_offset; + id<3> current_dest_offset = dest_offset; + std::size_t row_size = transferred_range[2] * src_element_size; - char *current_src = reinterpret_cast<char *>(base_src); - char *current_dest = reinterpret_cast<char *>(base_dest); + for (std::size_t surface = 0; surface < transferred_range[0]; ++surface) { + for (std::size_t row = 0; row < transferred_range[1]; ++row) { - current_src += - linear_index(current_src_offset, src_allocation_shape) * - src_element_size; + char *current_src = reinterpret_cast<char *>(base_src); + char *current_dest = reinterpret_cast<char *>(base_dest); - current_dest += - linear_index(current_dest_offset, dest_allocation_shape) * - dest_element_size; + current_src += + linear_index(current_src_offset, src_allocation_shape) * + src_element_size; - assert(current_src + row_size <= - reinterpret_cast<char *>(base_src) + - src_allocation_shape.size() * src_element_size); - assert(current_dest + row_size <= - reinterpret_cast<char *>(base_dest) + - dest_allocation_shape.size() * dest_element_size); + current_dest += + linear_index(current_dest_offset, dest_allocation_shape) * + dest_element_size; - memcpy(current_dest, current_src, row_size); + assert(current_src + row_size <= + reinterpret_cast<char *>(base_src) + + src_allocation_shape.size() * src_element_size); + assert(current_dest + row_size <= + reinterpret_cast<char *>(base_dest) + + dest_allocation_shape.size() * dest_element_size); - ++current_src_offset[1]; - ++current_dest_offset[1]; - } - current_src_offset[1] = src_offset[1]; - current_dest_offset[1] = dest_offset[1]; + memcpy(current_dest, current_src, row_size); - ++current_dest_offset[0]; - ++current_src_offset[0]; + ++current_src_offset[1]; + ++current_dest_offset[1]; } + current_src_offset[1] = src_offset[1]; + current_dest_offset[1] = dest_offset[1]; + + ++current_dest_offset[0]; + ++current_src_offset[0]; } - }); - } else { - return register_error( - __acpp_here(), - error_info{"omp_queue: OpenMP CPU backend cannot transfer data between " - "host and accelerators.", - error_type::feature_not_supported}); - } + } + }); return make_success(); } @@ -361,7 +386,7 @@ result omp_queue::submit_kernel(kernel_operation &op, const dag_node_ptr& node) const kernel_configuration *config = &(op.get_launcher().get_kernel_configuration()); - + auto backend_id = _backend_id; void* params = this; rt::dag_node* node_ptr = node.get(); @@ -409,7 +434,7 @@ result omp_queue::submit_sscp_kernel_from_code_object( group_size, args, arg_sizes, num_args, local_mem_size}; _config = initial_config; - + _config.append_base_configuration( kernel_base_config_parameter::backend_id, backend_id::omp); _config.append_base_configuration( @@ -439,7 +464,7 @@ result omp_queue::submit_sscp_kernel_from_code_object( // Lower kernels to binary auto err = glue::jit::compile(translator.get(), hcf, selected_image_name, - _config, compiled_image); + _config, _reflection_map, compiled_image); if (!err.is_success()) { register_error(err); diff --git a/src/runtime/runtime_event_handlers.cpp b/src/runtime/runtime_event_handlers.cpp new file mode 100644 index 000000000..6e56c63f8 --- /dev/null +++ b/src/runtime/runtime_event_handlers.cpp @@ -0,0 +1,42 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. 
+ * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + + +#include "hipSYCL/runtime/runtime_event_handlers.hpp" +#include "hipSYCL/runtime/allocation_tracker.hpp" +#include "hipSYCL/runtime/application.hpp" +#include "hipSYCL/runtime/settings.hpp" + +namespace hipsycl { +namespace rt { + +runtime_event_handlers::runtime_event_handlers() { + _needs_allocation_tracking = application::get_settings().get< + setting::enable_allocation_tracking>(); +} + +void runtime_event_handlers::on_new_allocation(const void *ptr, + std::size_t size, + const allocation_info &info) { + if (_needs_allocation_tracking) { + allocation_tracker::register_allocation(ptr, size, info); + } +} + + +void runtime_event_handlers::on_deallocation(const void* ptr) { + if (_needs_allocation_tracking) { + allocation_tracker::unregister_allocation(ptr); + } +} + +} +} diff --git a/src/runtime/ze/ze_allocator.cpp b/src/runtime/ze/ze_allocator.cpp index e3567f842..4b870c83a 100644 --- a/src/runtime/ze/ze_allocator.cpp +++ b/src/runtime/ze/ze_allocator.cpp @@ -10,6 +10,7 @@ // SPDX-License-Identifier: BSD-2-Clause #include +#include "hipSYCL/runtime/device_id.hpp" #include "hipSYCL/runtime/ze/ze_allocator.hpp" #include "hipSYCL/runtime/error.hpp" #include "hipSYCL/runtime/util.hpp" @@ -17,13 +18,18 @@ namespace hipsycl { namespace rt { -ze_allocator::ze_allocator(const ze_hardware_context *device, +ze_allocator::ze_allocator(std::size_t device_index, + const ze_hardware_context *device, const ze_hardware_manager *hw_manager) : _ctx{device->get_ze_context()}, _dev{device->get_ze_device()}, _global_mem_ordinal{device->get_ze_global_memory_ordinal()}, - _hw_manager{hw_manager} {} + _hw_manager{hw_manager} { + _dev_id = device_id{backend_descriptor{hardware_platform::level_zero, + api_platform::level_zero}, + static_cast<int>(device_index)}; +} -void* ze_allocator::allocate(size_t min_alignment, size_t size_bytes) { +void* ze_allocator::raw_allocate(size_t min_alignment, size_t size_bytes) { void* out = nullptr; @@ -47,8 +53,8 @@ void* ze_allocator::allocate(size_t min_alignment, size_t size_bytes) { return out; } -void* ze_allocator::allocate_optimized_host(size_t min_alignment, - size_t bytes) { +void* ze_allocator::raw_allocate_optimized_host(size_t min_alignment, + size_t bytes) { void* out = nullptr; ze_host_mem_alloc_desc_t desc; @@ -69,7 +75,7 @@ void* ze_allocator::allocate_optimized_host(size_t min_alignment, return out; } -void ze_allocator::free(void *mem) { +void ze_allocator::raw_free(void *mem) { ze_result_t err = zeMemFree(_ctx, mem); if(err != ZE_RESULT_SUCCESS) { @@ -79,7 +85,7 @@ void ze_allocator::free(void *mem) { } } -void* ze_allocator::allocate_usm(size_t bytes) { +void* ze_allocator::raw_allocate_usm(size_t bytes) { void* out = nullptr; @@ -167,5 +173,9 @@ result ze_allocator::mem_advise(const void *addr, std::size_t num_bytes, return make_success(); } +device_id ze_allocator::get_device() const { + return _dev_id; +} + } } diff --git a/src/runtime/ze/ze_backend.cpp b/src/runtime/ze/ze_backend.cpp index c49334a9e..1cfac7aab 100644 --- a/src/runtime/ze/ze_backend.cpp +++ b/src/runtime/ze/ze_backend.cpp @@ -58,7 +58,7 @@ ze_backend::ze_backend() { _hardware_manager = std::make_unique(); for(std::size_t i = 0; i < _hardware_manager->get_num_devices(); ++i) { - _allocators.push_back(ze_allocator{ + _allocators.push_back(ze_allocator{i, 
static_cast<ze_hardware_context *>(_hardware_manager->get_device(i)), _hardware_manager.get()}); } diff --git a/src/runtime/ze/ze_hardware_manager.cpp b/src/runtime/ze/ze_hardware_manager.cpp index 49511542c..8fb4e9971 100644 --- a/src/runtime/ze/ze_hardware_manager.cpp +++ b/src/runtime/ze/ze_hardware_manager.cpp @@ -449,6 +449,13 @@ std::size_t ze_hardware_context::get_property(device_uint_property prop) const { case device_uint_property::vendor_id: return _props.vendorId; break; + case device_uint_property::architecture: + // TODO + return 0; + break; + case device_uint_property::backend_id: + return static_cast<std::size_t>(backend_id::level_zero); + break; } assert(false && "Invalid device property"); std::terminate(); @@ -505,6 +512,15 @@ uint32_t ze_hardware_context::get_ze_global_memory_ordinal() const { return result; } +std::size_t ze_hardware_context::get_platform_index() const { + return 0; +} + +std::size_t ze_hardware_manager::get_num_platforms() const { + return 1; +} + + ze_hardware_manager::ze_hardware_manager() { if (has_device_visibility_mask( diff --git a/src/runtime/ze/ze_queue.cpp b/src/runtime/ze/ze_queue.cpp index 2ce9a774b..db281ad0f 100644 --- a/src/runtime/ze/ze_queue.cpp +++ b/src/runtime/ze/ze_queue.cpp @@ -149,6 +149,8 @@ ze_queue::ze_queue(ze_hardware_manager *hw_manager, std::size_t device_index) ze_hardware_context *hw_context = static_cast<ze_hardware_context *>(hw_manager->get_device(device_index)); + + _reflection_map = glue::jit::construct_default_reflection_map(hw_context); assert(hw_context); @@ -504,10 +506,10 @@ result ze_queue::submit_sscp_kernel_from_code_object( if(kernel_names.size() == 1) { err = glue::jit::dead_argument_elimination::compile_kernel( translator.get(), hcf_object, selected_image_name, _config, - binary_configuration_id, compiled_image); + binary_configuration_id, _reflection_map, compiled_image); } else { err = glue::jit::compile(translator.get(), - hcf_object, selected_image_name, _config, compiled_image); + hcf_object, selected_image_name, _config, _reflection_map, compiled_image); } if(!err.is_success()) { diff --git a/src/tools/acpp-info/acpp-info.cpp b/src/tools/acpp-info/acpp-info.cpp index a469af8b1..02d2f8fd3 100644 --- a/src/tools/acpp-info/acpp-info.cpp +++ b/src/tools/acpp-info/acpp-info.cpp @@ -70,6 +70,11 @@ void list_device_details(rt::device_id dev, rt::backend *b, std::cout << " General device information:" << std::endl; print_info("Name", hw->get_device_name(), 2); print_info("Backend", b->get_name(), 2); + print_info("Platform", + "Backend " + + std::to_string(static_cast<int>(b->get_unique_backend_id())) + + " / Platform " + std::to_string(hw->get_platform_index()), + 2); print_info("Vendor", hw->get_vendor_name(), 2); print_info("Arch", hw->get_device_arch(), 2); print_info("Driver version", hw->get_driver_version(), 2);
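The platform queries introduced across the backends above (get_num_platforms() on the hardware managers, get_platform_index() on the contexts) are what acpp-info uses for its new "Platform" line. A sketch of how a tool can group devices by platform, assuming a hardware_manager pointer hw_mgr is already at hand (the loop and output formatting are illustrative):

  for (std::size_t i = 0; i < hw_mgr->get_num_devices(); ++i) {
    hipsycl::rt::hardware_context *hw = hw_mgr->get_device(i);
    std::cout << "device " << i << " -> platform " << hw->get_platform_index()
              << " of " << hw_mgr->get_num_platforms() << " platform(s)"
              << std::endl;
  }

For CUDA, HIP, Level Zero and OpenMP this always reports a single platform 0 in this patch; only the OpenCL backend can expose several platforms.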
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 64f58ea27..f35961e62 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,6 +4,7 @@ project(adaptivecpp-tests) set(Boost_USE_STATIC_LIBS off) set(BUILD_SHARED_LIBS on) set(REDUCED_LOCAL_MEM_USAGE OFF CACHE BOOL "Only run tests with reduced local memory usage to allow running on hardware with little local memory.") +set(ACPP_TEST_WORK_GROUP_SHUFFLE_EXT OFF CACHE BOOL "Enable work group shuffle tests that are an AdaptiveCpp extension.") find_package(Boost COMPONENTS unit_test_framework REQUIRED) @@ -39,6 +40,9 @@ if(REDUCED_LOCAL_MEM_USAGE) add_definitions(-DREDUCED_LOCAL_MEM_USAGE) endif() +if(ACPP_TEST_WORK_GROUP_SHUFFLE_EXT) + add_definitions(-DACPP_TEST_WORK_GROUP_SHUFFLE_EXT) +endif() #Use add_definitions for now for older cmake versions cmake_policy(SET CMP0005 NEW) @@ -134,21 +138,27 @@ if(WITH_PSTL_TESTS) pstl/copy.cpp pstl/copy_if.cpp pstl/copy_n.cpp + pstl/exclusive_scan.cpp pstl/fill.cpp pstl/fill_n.cpp pstl/for_each.cpp pstl/for_each_n.cpp pstl/generate.cpp pstl/generate_n.cpp + pstl/inclusive_scan.cpp pstl/memory.cpp + pstl/merge.cpp pstl/none_of.cpp pstl/reduce.cpp pstl/replace.cpp pstl/replace_if.cpp pstl/replace_copy.cpp pstl/replace_copy_if.cpp + pstl/sort.cpp pstl/transform.cpp pstl/transform_reduce.cpp + pstl/transform_inclusive_scan.cpp + pstl/transform_exclusive_scan.cpp pstl/pointer_validation.cpp pstl/allocation_map.cpp pstl/free_space_map.cpp) diff --git a/tests/compiler/cbs/accumulator_for.cpp b/tests/compiler/cbs/accumulator_for.cpp index 1ce08366f..43946df8f 100644 --- a/tests/compiler/cbs/accumulator_for.cpp +++ b/tests/compiler/cbs/accumulator_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/add_modulo.cpp b/tests/compiler/cbs/add_modulo.cpp index 718c413f8..729e5135e 100644 --- a/tests/compiler/cbs/add_modulo.cpp +++ b/tests/compiler/cbs/add_modulo.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/cond_between_barriers.cpp b/tests/compiler/cbs/cond_between_barriers.cpp index 2e1e815cf..63d8cf045 100644 --- a/tests/compiler/cbs/cond_between_barriers.cpp +++ b/tests/compiler/cbs/cond_between_barriers.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include diff --git a/tests/compiler/cbs/conds.cpp b/tests/compiler/cbs/conds.cpp index 5515d46eb..6e7223791 100644 --- a/tests/compiler/cbs/conds.cpp +++ b/tests/compiler/cbs/conds.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/conds_in_for.cpp b/tests/compiler/cbs/conds_in_for.cpp index f9c0e27d7..60ded21af 100644 --- a/tests/compiler/cbs/conds_in_for.cpp +++ b/tests/compiler/cbs/conds_in_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git 
a/tests/compiler/cbs/const_init_accumulator_for.cpp b/tests/compiler/cbs/const_init_accumulator_for.cpp index fade89530..27bcc104d 100644 --- a/tests/compiler/cbs/const_init_accumulator_for.cpp +++ b/tests/compiler/cbs/const_init_accumulator_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/for_in_cond.cpp b/tests/compiler/cbs/for_in_cond.cpp index 0c9b2efa1..8e84d0326 100644 --- a/tests/compiler/cbs/for_in_cond.cpp +++ b/tests/compiler/cbs/for_in_cond.cpp @@ -3,6 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/group_barrier.cpp b/tests/compiler/cbs/group_barrier.cpp index edcf24107..9636529ec 100644 --- a/tests/compiler/cbs/group_barrier.cpp +++ b/tests/compiler/cbs/group_barrier.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/item_dependent_cond_in_for.cpp b/tests/compiler/cbs/item_dependent_cond_in_for.cpp index d10a0c48c..5cbeacf39 100644 --- a/tests/compiler/cbs/item_dependent_cond_in_for.cpp +++ b/tests/compiler/cbs/item_dependent_cond_in_for.cpp @@ -3,6 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/item_dependent_for.cpp b/tests/compiler/cbs/item_dependent_for.cpp index 5a9141ffb..5e43d2f4f 100644 --- a/tests/compiler/cbs/item_dependent_for.cpp +++ b/tests/compiler/cbs/item_dependent_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/multiple_indvars_for.cpp b/tests/compiler/cbs/multiple_indvars_for.cpp index 51a4e8bc4..500659e5e 100644 --- a/tests/compiler/cbs/multiple_indvars_for.cpp +++ b/tests/compiler/cbs/multiple_indvars_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include 
#include diff --git a/tests/compiler/cbs/no_barriers.cpp b/tests/compiler/cbs/no_barriers.cpp index 9ee651958..bae267194 100644 --- a/tests/compiler/cbs/no_barriers.cpp +++ b/tests/compiler/cbs/no_barriers.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include diff --git a/tests/compiler/cbs/reduce_const_for.cpp b/tests/compiler/cbs/reduce_const_for.cpp index 322bdfe08..375c511a7 100644 --- a/tests/compiler/cbs/reduce_const_for.cpp +++ b/tests/compiler/cbs/reduce_const_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_do_while.cpp b/tests/compiler/cbs/reduce_do_while.cpp index 11eee1716..f09c61339 100644 --- a/tests/compiler/cbs/reduce_do_while.cpp +++ b/tests/compiler/cbs/reduce_do_while.cpp @@ -3,6 +3,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_for.cpp b/tests/compiler/cbs/reduce_for.cpp index b483c0323..c3444764a 100644 --- a/tests/compiler/cbs/reduce_for.cpp +++ b/tests/compiler/cbs/reduce_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include diff --git a/tests/compiler/cbs/reduce_for_inverse_barrier.cpp b/tests/compiler/cbs/reduce_for_inverse_barrier.cpp index 728880a38..edc33d2ea 100644 --- a/tests/compiler/cbs/reduce_for_inverse_barrier.cpp +++ b/tests/compiler/cbs/reduce_for_inverse_barrier.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_nested_for.cpp b/tests/compiler/cbs/reduce_nested_for.cpp index 14e87a679..f8d84bb1b 100644 --- a/tests/compiler/cbs/reduce_nested_for.cpp +++ b/tests/compiler/cbs/reduce_nested_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_unrolled.cpp 
b/tests/compiler/cbs/reduce_unrolled.cpp index 0ac46b4c5..ecf4763ac 100644 --- a/tests/compiler/cbs/reduce_unrolled.cpp +++ b/tests/compiler/cbs/reduce_unrolled.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_while.cpp b/tests/compiler/cbs/reduce_while.cpp index b5a761ad0..56f28d8b9 100644 --- a/tests/compiler/cbs/reduce_while.cpp +++ b/tests/compiler/cbs/reduce_while.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/reduce_while_early_update.cpp b/tests/compiler/cbs/reduce_while_early_update.cpp index 2785be3fb..a14e08ddc 100644 --- a/tests/compiler/cbs/reduce_while_early_update.cpp +++ b/tests/compiler/cbs/reduce_while_early_update.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/right_heavy_cond.cpp b/tests/compiler/cbs/right_heavy_cond.cpp index 5d35cec19..ff8b6f327 100644 --- a/tests/compiler/cbs/right_heavy_cond.cpp +++ b/tests/compiler/cbs/right_heavy_cond.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/simple_kernel.cpp b/tests/compiler/cbs/simple_kernel.cpp index a7be8d9f5..862696cab 100644 --- a/tests/compiler/cbs/simple_kernel.cpp +++ b/tests/compiler/cbs/simple_kernel.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/stencil.cpp b/tests/compiler/cbs/stencil.cpp index 064e97b19..f0f3b954d 100644 --- a/tests/compiler/cbs/stencil.cpp +++ b/tests/compiler/cbs/stencil.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/cbs/sycl_dgemm.cpp b/tests/compiler/cbs/sycl_dgemm.cpp index 3b2d3a72b..2ed1683eb 100644 --- 
a/tests/compiler/cbs/sycl_dgemm.cpp +++ b/tests/compiler/cbs/sycl_dgemm.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s // adapted from https://github.com/UoB-HPC/sycl_dgemm/blob/main/dgemm.cpp diff --git a/tests/compiler/cbs/two_barrier_for.cpp b/tests/compiler/cbs/two_barrier_for.cpp index 436580779..168155932 100644 --- a/tests/compiler/cbs/two_barrier_for.cpp +++ b/tests/compiler/cbs/two_barrier_for.cpp @@ -2,6 +2,10 @@ // RUN: %t | FileCheck %s // RUN: %acpp %s -o %t --acpp-targets=omp --acpp-use-accelerated-cpu -O // RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O +// RUN: ACPP_VISIBILITY_MASK=omp %t | FileCheck %s #include #include diff --git a/tests/compiler/sscp/dynamic_function.cpp b/tests/compiler/sscp/dynamic_function.cpp index 076c01f41..0377050f1 100644 --- a/tests/compiler/sscp/dynamic_function.cpp +++ b/tests/compiler/sscp/dynamic_function.cpp @@ -21,7 +21,7 @@ SYCL_EXTERNAL void myfunction2(int* data, sycl::item<1> idx) { __attribute__((noinline)) void execute_operations_with_definition(int* data, sycl::item<1> idx) { - sycl::jit::arguments_are_used(data, idx); + sycl::AdaptiveCpp_jit::arguments_are_used(data, idx); } void execute_operations_without_definition(int* data, sycl::item<1> idx); @@ -34,7 +34,7 @@ int main() { { *data = 0; - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; dyn_function_config.define(&execute_operations_without_definition, &myfunction1); q.parallel_for(sycl::range{1}, dyn_function_config.apply([=](sycl::item<1> idx){ execute_operations_without_definition(data, idx); @@ -48,7 +48,7 @@ int main() { { *data = 0; - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; dyn_function_config.define(&execute_operations_without_definition, &myfunction1); q.parallel_for(sycl::range{1}, dyn_function_config.apply([=](sycl::item<1> idx){ execute_operations_without_definition(data, idx); @@ -62,7 +62,7 @@ int main() { { *data = 0; - sycl::jit::dynamic_function_config dyn_function_config; + sycl::AdaptiveCpp_jit::dynamic_function_config dyn_function_config; dyn_function_config.define_as_call_sequence(&execute_operations_without_definition, {&myfunction1, &myfunction2}); q.parallel_for(sycl::range{1}, dyn_function_config.apply([=](sycl::item<1> idx){ execute_operations_without_definition(data, idx); diff --git a/tests/compiler/sscp/export-all/export_all.cpp b/tests/compiler/sscp/export-all/export_all.cpp new file mode 100644 index 000000000..950649327 --- /dev/null +++ b/tests/compiler/sscp/export-all/export_all.cpp @@ -0,0 +1,26 @@ +// RUN: %acpp %s %S/second_tu.cpp -o %t --acpp-targets=generic --acpp-export-all +// RUN: %t | FileCheck %s +// RUN: %acpp %s %S/second_tu.cpp -o %t --acpp-targets=generic -O3 --acpp-export-all +// RUN: %t | FileCheck %s +// RUN: %acpp %s %S/second_tu.cpp -o %t --acpp-targets=generic -g --acpp-export-all +// RUN: %t | FileCheck %s + +#include +#include +#include "../common.hpp" + +// defined in second_tu.cpp +int increment(int x); + +int main() { + sycl::queue q = 
get_queue(); + int* data = sycl::malloc_shared<int>(1, q); + q.single_task([=](){ + *data = increment(123); + }); + q.wait(); + + // CHECK: 124 + std::cout << *data << std::endl; + sycl::free(data, q); +} diff --git a/tests/compiler/sscp/export-all/lit.local.cfg b/tests/compiler/sscp/export-all/lit.local.cfg new file mode 100644 index 000000000..0939c330d --- /dev/null +++ b/tests/compiler/sscp/export-all/lit.local.cfg @@ -0,0 +1 @@ +config.excludes = ["second_tu.cpp"] \ No newline at end of file diff --git a/tests/compiler/sscp/export-all/second_tu.cpp b/tests/compiler/sscp/export-all/second_tu.cpp new file mode 100644 index 000000000..d6b52a9b4 --- /dev/null +++ b/tests/compiler/sscp/export-all/second_tu.cpp @@ -0,0 +1,5 @@ +#include + +int increment(int x) { + return x+1; +} \ No newline at end of file diff --git a/tests/compiler/sscp/s2_reflection.cpp b/tests/compiler/sscp/s2_reflection.cpp new file mode 100644 index 000000000..92295d99a --- /dev/null +++ b/tests/compiler/sscp/s2_reflection.cpp @@ -0,0 +1,88 @@ + +// RUN: %acpp %s -o %t --acpp-targets=generic +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O3 +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -O3 -ffast-math +// RUN: %t | FileCheck %s +// RUN: %acpp %s -o %t --acpp-targets=generic -g +// RUN: %t | FileCheck %s + +#include +#include +#include +#include "common.hpp" +#include "hipSYCL/runtime/hardware.hpp" + +extern "C" bool __acpp_sscp_jit_reflect_knows_random_unknown_thing(); + +int main() { + sycl::queue q = get_queue(); + int* data = sycl::malloc_shared<int>(7, q); + + q.single_task([data]{ + __acpp_if_target_device( + data[0] = __acpp_sscp_jit_reflect_runtime_backend(); + data[1] = __acpp_sscp_jit_reflect_target_arch(); + data[2] = __acpp_sscp_jit_reflect_target_is_cpu(); + data[3] = static_cast<int>(__acpp_sscp_jit_reflect_compiler_backend()); + data[4] = __acpp_sscp_jit_reflect_target_vendor_id(); + + data[5] = __acpp_sscp_jit_reflect_knows_runtime_backend(); + data[6] = __acpp_sscp_jit_reflect_knows_random_unknown_thing(); + ); + }).wait(); + + auto dev = q.get_device().AdaptiveCpp_device_id(); + hipsycl::rt::runtime_keep_alive_token rt; + hipsycl::rt::hardware_context *ctx = rt.get() + ->backends() + .get(dev.get_backend()) + ->get_hardware_manager() + ->get_device(dev.get_id()); + + // CHECK: 1 + std::cout << (data[0] == static_cast<int>(dev.get_backend())) << std::endl; + // CHECK: 1 + std::cout << (data[1] == + static_cast<int>(ctx->get_property( + hipsycl::rt::device_uint_property::architecture))) + << std::endl; + // CHECK: 1 + std::cout << (data[2] == static_cast<int>(ctx->is_cpu())) << std::endl; + + // We don't have a mechanism yet to query compiler backends on the host, so + // cannot test data[3] for now. 
+ + // CHECK: 1 + std::cout << (data[4] == + static_cast<int>(ctx->get_property( + hipsycl::rt::device_uint_property::vendor_id))) << std::endl; + + // CHECK: 1 + std::cout << data[5] << std::endl; + // CHECK: 0 + std::cout << data[6] << std::endl; + + q.single_task([=]() { + __acpp_if_target_device( + auto backend = sycl::AdaptiveCpp_jit::reflect< + sycl::AdaptiveCpp_jit::reflection_query::runtime_backend>(); + data[0] = sycl::AdaptiveCpp_jit::compile_if_else( + backend == static_cast<int>(sycl::backend::omp), + []() { return 1; }, + []() { return 0; }); + data[1] = sycl::AdaptiveCpp_jit::knows< + sycl::AdaptiveCpp_jit::reflection_query::runtime_backend>(); + ); + }).wait(); + // CHECK: 1 + std::cout << (data[0] == (q.get_device().get_backend() == + sycl::backend::omp)) + << std::endl; + + // CHECK: 1 + std::cout << data[1] << std::endl; + + sycl::free(data, q); +} diff --git a/tests/pstl/allocation_map.cpp b/tests/pstl/allocation_map.cpp index 3241eb877..dbc96fe8f 100644 --- a/tests/pstl/allocation_map.cpp +++ b/tests/pstl/allocation_map.cpp @@ -24,7 +24,8 @@ BOOST_AUTO_TEST_SUITE(pstl_allocation_map) -using amap_t = hipsycl::stdpar::allocation_map<>; +struct payload{}; +using amap_t = hipsycl::common::allocation_map<payload>; template <class F> void for_each_test_allocation(std::size_t n, F&& f) { diff --git a/tests/pstl/copy_if.cpp b/tests/pstl/copy_if.cpp index eacce1e8e..f58d4ad65 100644 --- a/tests/pstl/copy_if.cpp +++ b/tests/pstl/copy_if.cpp @@ -35,13 +35,12 @@ void test_copy_if(std::size_t problem_size, Generator&& gen) { auto ret = std::copy_if(std::execution::par_unseq, data.begin(), data.end(), dest_device.begin(), p); - std::copy_if(data.begin(), data.end(), dest_host.begin(), p); + auto ret_reference = std::copy_if(data.begin(), data.end(), dest_host.begin(), p); - BOOST_CHECK(ret == dest_device.begin() + problem_size); - // Our copy_if implementation is currently incorrect, since - // we always copy results to the same position (we would - // actually need to run a scan algorithm to find the right place) - //BOOST_CHECK(dest_device == dest_host); + BOOST_CHECK(std::distance(dest_device.begin(), ret) == + std::distance(dest_host.begin(), ret_reference)); + + BOOST_CHECK(dest_device == dest_host); }
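// The element-wise comparison above is only possible because copy_if now
// computes a distinct output position for every accepted element, presumably
// via an exclusive scan over the predicate results (in line with the scan
// algorithms introduced below), instead of writing all matches to one slot.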
BOOST_AUTO_TEST_CASE(par_unseq_empty) { diff --git a/tests/pstl/exclusive_scan.cpp b/tests/pstl/exclusive_scan.cpp new file mode 100644 index 000000000..592b3b049 --- /dev/null +++ b/tests/pstl/exclusive_scan.cpp @@ -0,0 +1,192 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_exclusive_scan, enable_unified_shared_memory) + +template <class T, int PaddingSize> +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; + t.data[0] = x; + return t; + } + + T get() const { + return data[0]; + } +private: + non_default_constructible(){} + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; +}; + +template <class T> +struct non_default_constructible<T, 1> { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template <class T, int PaddingSize> +bool operator==(const non_default_constructible<T, PaddingSize> &a, + const non_default_constructible<T, PaddingSize> &b) { + return a.get() == b.get(); +} + +template <class T, int PaddingSize> +bool operator!=(const non_default_constructible<T, PaddingSize> &a, + const non_default_constructible<T, PaddingSize> &b) { + return a.get() != b.get(); +} + +template <class Policy, class Generator, class T, class BinOp> +void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { + std::vector<T> data; + for(std::size_t i = 0; i < size; ++i) + data.push_back(gen(i)); + + std::vector<T> reference0; + if constexpr(std::is_same_v<BinOp, std::plus<>>) { + reference0 = data; + std::exclusive_scan(data.begin(), data.end(), reference0.begin(), init); + } + + std::vector<T> reference1 = data; + std::exclusive_scan(data.begin(), data.end(), reference1.begin(), init, op); + + std::vector<T> device_result0 = data; + if constexpr(std::is_same_v<BinOp, std::plus<>>) { + BOOST_CHECK(std::exclusive_scan(pol, data.begin(), data.end(), + device_result0.begin(), init) == + device_result0.end()); + } + + std::vector<T> device_result1 = data; + BOOST_CHECK(std::exclusive_scan(pol, data.begin(), data.end(), + device_result1.begin(), init, op) == + device_result1.end()); + + + if constexpr(std::is_same_v<BinOp, std::plus<>>) { + BOOST_CHECK(reference0 == device_result0); + } + BOOST_CHECK(reference1 == device_result1); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast<int>(i); + }; +} + +template <class T> +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template <class T> +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); + }; +} + +template <std::size_t ProblemSize, class Policy> +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible<unsigned long long, 1>; + test_scan(std::execution::par_unseq, + get_non_constructible_generator<non_constructible_t>(), + non_constructible_t::make(3ull), + get_non_constructible_bin_op<non_constructible_t>(), ProblemSize); + + /*using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize);*/ +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par_unseq); +}
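+// The problem sizes exercised here are presumably chosen to hit the distinct
+// paths of a work-group based scan: 0 and 1 as trivial edge cases, 127 as a
+// single partially filled work group, 1000 as several incomplete groups, and
+// 1024*1024 as a genuinely multi-group, multi-level problem.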
+ +BOOST_AUTO_TEST_CASE(par_unseq_large) +{ + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + +BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/pstl/free_space_map.cpp b/tests/pstl/free_space_map.cpp index 894febd9f..960368969 100644 --- a/tests/pstl/free_space_map.cpp +++ b/tests/pstl/free_space_map.cpp @@ -24,7 +24,9 @@ BOOST_AUTO_TEST_SUITE(pstl_free_space_map) -using amap_t = hipsycl::stdpar::allocation_map<>; +struct payload {}; + +using amap_t = hipsycl::stdpar::allocation_map<payload>; using fmap_t = hipsycl::stdpar::free_space_map; uint64_t next_pow2(uint64_t x) { diff --git a/tests/pstl/inclusive_scan.cpp b/tests/pstl/inclusive_scan.cpp new file mode 100644 index 000000000..942c510b5 --- /dev/null +++ b/tests/pstl/inclusive_scan.cpp @@ -0,0 +1,200 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_inclusive_scan, enable_unified_shared_memory) + +template <class T, int PaddingSize> +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; + t.data[0] = x; + return t; + } + + T get() const { + return data[0]; + } +private: + non_default_constructible(){} + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; +}; + +template <class T> +struct non_default_constructible<T, 1> { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template <class T, int PaddingSize> +bool operator==(const non_default_constructible<T, PaddingSize> &a, + const non_default_constructible<T, PaddingSize> &b) { + return a.get() == b.get(); +} + +template <class T, int PaddingSize> +bool operator!=(const non_default_constructible<T, PaddingSize> &a, + const non_default_constructible<T, PaddingSize> &b) { + return a.get() != b.get(); +} + +template <class Policy, class Generator, class T, class BinOp> +void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { + std::vector<T> data; + for(std::size_t i = 0; i < size; ++i) + data.push_back(gen(i)); + + std::vector<T> reference0; + if constexpr(std::is_same_v<BinOp, std::plus<>>) { + reference0 = data; + std::inclusive_scan(data.begin(), data.end(), reference0.begin()); + } + + std::vector<T> reference1 = data; + std::inclusive_scan(data.begin(), data.end(), reference1.begin(), op); + + std::vector<T> reference2 = data; + std::inclusive_scan(data.begin(), data.end(), reference2.begin(), op, init); + + std::vector<T> device_result0 = data; + if constexpr(std::is_same_v<BinOp, std::plus<>>) { + BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), + device_result0.begin()) == + device_result0.end()); + } + + std::vector<T> device_result1 = data; + BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), + device_result1.begin(), op) == + device_result1.end()); + + std::vector<T> device_result2 = data;
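+ // std::inclusive_scan must return an iterator one past the last element
+ // written, so the offloaded call is checked against the end of the output: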
+ BOOST_CHECK(std::inclusive_scan(pol, data.begin(), data.end(), + device_result2.begin(), op, init) == + device_result2.end()); + + if constexpr(std::is_same_v<BinOp, std::plus<>>) { + BOOST_CHECK(reference0 == device_result0); + } + BOOST_CHECK(reference1 == device_result1); + BOOST_CHECK(reference2 == device_result2); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast<int>(i); + }; +} + +template <class T> +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template <class T> +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); + }; +} + +template <std::size_t ProblemSize, class Policy> +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible<unsigned long long, 1>; + test_scan(std::execution::par_unseq, + get_non_constructible_generator<non_constructible_t>(), + non_constructible_t::make(3ull), + get_non_constructible_bin_op<non_constructible_t>(), ProblemSize); + + /*using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize);*/ +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_large) { + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + +BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/pstl/merge.cpp b/tests/pstl/merge.cpp new file mode 100644 index 000000000..fd741946a --- /dev/null +++ b/tests/pstl/merge.cpp @@ -0,0 +1,146 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details.
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_merge, enable_unified_shared_memory) + +template <class Policy, class Generator1, class Generator2, class Comp = std::less<>> +void test_merge(Policy &&pol, std::size_t size1, std::size_t size2, + Generator1 gen1, Generator2 gen2, Comp comp = {}) { + std::vector<int> data1(size1); + std::vector<int> data2(size2); + std::vector<int> out(size1+size2); + + for(int i = 0; i < size1; ++i) + data1[i] = gen1(i); + for(int i = 0; i < size2; ++i) + data2[i] = gen2(i); + std::vector<int> host_out = out; + + auto ret = std::merge(pol, data1.begin(), data1.end(), data2.begin(), data2.end(), + out.begin(), comp); + auto host_ret = std::merge(data1.begin(), data1.end(), data2.begin(), data2.end(), + host_out.begin(), comp); + + BOOST_CHECK(host_out == out); + BOOST_CHECK(ret == out.begin() + std::distance(host_out.begin(), host_ret)); + + for(int i = 0; i < out.size(); ++i) { + auto expected = host_out[i]; + auto result = out[i]; + if(result != expected) + std::cout << i << ": " << expected << " != " << result << std::endl; + } +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + test_merge( + std::execution::par_unseq, 0, 0, [](int i) { return 0; }, + [](int i) { return 0; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + test_merge( + std::execution::par_unseq, 1, 0, [](int i) { return 2; }, + [](int i) { return 3; }); + test_merge( + std::execution::par_unseq, 0, 1, [](int i) { return 2; }, + [](int i) { return 3; }); + test_merge( + std::execution::par_unseq, 1, 1, [](int i) { return 2; }, + [](int i) { return 3; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_trivial_merge) { + + auto a = [](int i) { return i; }; + auto b = [](int i) { return i + 1024; }; + + test_merge( + std::execution::par_unseq, 1024, 1024, a, b); + + test_merge( + std::execution::par_unseq, 1024, 1024, b, a); +} + +std::vector<int> generate_sorted_random_numbers(int amount, int seed=123) { + std::mt19937 gen(seed); + std::uniform_int_distribution<int> dist; + + std::vector<int> data; + for(int i = 0; i < amount; ++i) + data.push_back(dist(gen)); + std::sort(data.begin(), data.end()); + return data; +} + +BOOST_AUTO_TEST_CASE(par_unseq_same_size) { + std::size_t s1 = 256; + std::size_t s2 = 256; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 42); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_same_data) { + std::size_t s1 = 1024; + std::size_t s2 = 1024; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 123); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_v1_larger) { + std::size_t s1 = 1932; + std::size_t s2 = 1000; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 42); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_CASE(par_unseq_v2_larger) { + std::size_t s1 = 1000; + std::size_t s2 = 1932; + + auto v1 = generate_sorted_random_numbers(s1, 123); + auto v2 = generate_sorted_random_numbers(s2, 42); + + test_merge( + std::execution::par_unseq, s1, s2, [&](int i) { return v1[i]; }, + [&](int i) { return v2[i]; }); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git
a/tests/pstl/pstl_test_suite.hpp b/tests/pstl/pstl_test_suite.hpp index 322696498..dcbfa1975 100644 --- a/tests/pstl/pstl_test_suite.hpp +++ b/tests/pstl/pstl_test_suite.hpp @@ -15,11 +15,15 @@ struct enable_unified_shared_memory { enable_unified_shared_memory() { +#ifndef __ACPP_STDPAR_ASSUME_SYSTEM_USM__ hipsycl::stdpar::unified_shared_memory::pop_disabled(); +#endif } ~enable_unified_shared_memory() { +#ifndef __ACPP_STDPAR_ASSUME_SYSTEM_USM__ hipsycl::stdpar::unified_shared_memory::push_disabled(); +#endif } }; diff --git a/tests/pstl/sort.cpp b/tests/pstl/sort.cpp new file mode 100644 index 000000000..4fb4f7b66 --- /dev/null +++ b/tests/pstl/sort.cpp @@ -0,0 +1,65 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include + +#include +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_sort, enable_unified_shared_memory) + +template <class Policy, class Generator, class Comp = std::less<>> +void test_sort(Policy &&pol, std::size_t problem_size, Generator gen, + Comp comp = {}) { + std::vector<int> data(problem_size); + for(int i = 0; i < problem_size; ++i) + data[i] = gen(i); + std::vector<int> host_data = data; + + std::sort(pol, data.begin(), data.end(), comp); + + BOOST_CHECK(std::is_sorted(data.begin(), data.end(), comp)); + std::sort(host_data.begin(), host_data.end(), comp); + BOOST_CHECK(host_data == data); +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + test_sort(std::execution::par_unseq, 0, [](int i){return 0;}); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + test_sort(std::execution::par_unseq, 1, [](int i){return i;}); +} + + +BOOST_AUTO_TEST_CASE(par_unseq_pow2_descending) { + test_sort(std::execution::par_unseq, 1024, [](int i){return -i;}); +} + +BOOST_AUTO_TEST_CASE(par_unseq_pow2_ascending) { + test_sort(std::execution::par_unseq, 1024, [](int i){return i;}); +} + +BOOST_AUTO_TEST_CASE(par_unseq_non_pow2_descending) { + test_sort(std::execution::par_unseq, 1000, [](int i){return -i;}); +} + +BOOST_AUTO_TEST_CASE(par_unseq_non_pow2_ascending) { + test_sort(std::execution::par_unseq, 1000, [](int i){return i;}); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/pstl/transform_exclusive_scan.cpp b/tests/pstl/transform_exclusive_scan.cpp new file mode 100644 index 000000000..8ccd971dd --- /dev/null +++ b/tests/pstl/transform_exclusive_scan.cpp @@ -0,0 +1,184 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details.
+ */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_transform_exclusive_scan, + enable_unified_shared_memory) + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; + t.data[0] = x; + return t; + } + + T get() const { + return data[0]; + } +private: + non_default_constructible(){} + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; +}; + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template +bool operator==(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() == b.get(); +} + +template +bool operator!=(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() != b.get(); +} + +template +void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { + std::vector data; + for(std::size_t i = 0; i < size; ++i) + data.push_back(gen(i)); + + + auto unary_op = [=](auto x){ + return op(x, x); + }; + + std::vector reference = data; + std::transform_exclusive_scan(data.begin(), data.end(), reference.begin(), + init, op, unary_op); + + + std::vector device_result = data; + BOOST_CHECK(std::transform_exclusive_scan(pol, data.begin(), data.end(), + device_result.begin(), init, op, unary_op) == + device_result.end()); + + + BOOST_CHECK(reference == device_result); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast(i); + }; +} + +template +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); + }; +} + +template +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + non_constructible_t::make(3ull), + get_non_constructible_bin_op(), ProblemSize); + + /*using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize);*/ +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_large) { + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + +BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} 
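+// NB: the test_scan calls inside run_all_tests hardcode
+// std::execution::par_unseq, so the par cases in this file currently re-run
+// the same policy and pol goes unused.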
+ +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/pstl/transform_inclusive_scan.cpp b/tests/pstl/transform_inclusive_scan.cpp new file mode 100644 index 000000000..7c6e2a27a --- /dev/null +++ b/tests/pstl/transform_inclusive_scan.cpp @@ -0,0 +1,194 @@ +/* + * This file is part of AdaptiveCpp, an implementation of SYCL and C++ standard + * parallelism for CPUs and GPUs. + * + * Copyright The AdaptiveCpp Contributors + * + * AdaptiveCpp is released under the BSD 2-Clause "Simplified" License. + * See file LICENSE in the project root for full license details. + */ +// SPDX-License-Identifier: BSD-2-Clause + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pstl_test_suite.hpp" + +BOOST_FIXTURE_TEST_SUITE(pstl_transform_inclusive_scan, + enable_unified_shared_memory) + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; + t.data[0] = x; + return t; + } + + T get() const { + return data[0]; + } +private: + non_default_constructible(){} + alignas(PaddingSize * sizeof(T)) T data [PaddingSize]; +}; + +template +struct non_default_constructible { +public: + static auto make(T x){ + non_default_constructible t; t.x = x; + return t; + } + + T get() const { + return x; + } +private: + non_default_constructible(){} + T x; +}; + +template +bool operator==(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() == b.get(); +} + +template +bool operator!=(const non_default_constructible &a, + const non_default_constructible &b) { + return a.get() != b.get(); +} + +template +void test_scan(Policy&& pol, Generator&& gen, T init, BinOp op, std::size_t size) { + std::vector data; + for(std::size_t i = 0; i < size; ++i) + data.push_back(gen(i)); + + + auto unary_op = [=](auto x){ + return op(x, x); + }; + + std::vector reference0 = data; + std::transform_inclusive_scan(data.begin(), data.end(), reference0.begin(), + op, unary_op); + + std::vector reference1 = data; + std::transform_inclusive_scan(data.begin(), data.end(), reference1.begin(), + op, unary_op, init); + + std::vector device_result0 = data; + + BOOST_CHECK(std::transform_inclusive_scan(pol, data.begin(), data.end(), + device_result0.begin(), op, unary_op) == + device_result0.end()); + + std::vector device_result1 = data; + BOOST_CHECK(std::transform_inclusive_scan(pol, data.begin(), data.end(), + device_result1.begin(), op, unary_op, init) == + device_result1.end()); + + + BOOST_CHECK(reference0 == device_result0); + BOOST_CHECK(reference1 == device_result1); +} + +inline auto get_default_generator() { + return [](std::size_t i) { + return static_cast(i); + }; +} + +template +inline auto get_non_constructible_generator() { + return [](std::size_t i) { + return T::make(i); + }; +} + +template +auto get_non_constructible_bin_op() { + return [](auto a, auto b){ + return T::make(a.get() + b.get()); + }; +} + +template +void run_all_tests(Policy&& pol) { + test_scan(std::execution::par_unseq, get_default_generator(), 3, + std::plus<>{}, ProblemSize); + test_scan( + std::execution::par_unseq, get_default_generator(), 3ull, + [](auto a, auto b) { return a * b; }, ProblemSize); + + using non_constructible_t = non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + 
non_constructible_t::make(3ull), + get_non_constructible_bin_op(), ProblemSize); + + /*using massive_non_constructible_t = + non_default_constructible; + test_scan(std::execution::par_unseq, + get_non_constructible_generator(), + massive_non_constructible_t::make(3ull), + get_non_constructible_bin_op(), + ProblemSize);*/ +} + +BOOST_AUTO_TEST_CASE(par_unseq_empty) { + run_all_tests<0>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_single_element) { + run_all_tests<1>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par_unseq); +} + +BOOST_AUTO_TEST_CASE(par_unseq_large) { + run_all_tests<1024*1024>(std::execution::par_unseq); +} + + +BOOST_AUTO_TEST_CASE(par_empty) { + run_all_tests<0>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_single_element) { + run_all_tests<1>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_incomplete_single_work_group) { + run_all_tests<127>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_multiple_groups_incomplete) { + run_all_tests<1000>(std::execution::par); +} + +BOOST_AUTO_TEST_CASE(par_large) { + run_all_tests<1024*1024>(std::execution::par); +} + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/sycl/extensions.cpp b/tests/sycl/extensions.cpp index 63668ca7c..507bc09b5 100644 --- a/tests/sycl/extensions.cpp +++ b/tests/sycl/extensions.cpp @@ -1222,5 +1222,17 @@ BOOST_AUTO_TEST_CASE(sycl_specialized) { sycl::free(data, q); } #endif +#ifdef SYCL_KHR_DEFAULT_CONTEXT +BOOST_AUTO_TEST_CASE(khr_default_context) { + using namespace cl; + sycl::queue q1; + sycl::queue q2; + + BOOST_CHECK(q1.get_context() == q2.get_context()); + BOOST_CHECK(q1.get_device().get_platform().khr_get_default_context() == + q1.get_context()); + BOOST_CHECK(sycl::context{} != q1.get_context()); +} +#endif BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/sycl/group_functions/group_functions.hpp b/tests/sycl/group_functions/group_functions.hpp index 93412dfa2..8fb264bcc 100644 --- a/tests/sycl/group_functions/group_functions.hpp +++ b/tests/sycl/group_functions/group_functions.hpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include @@ -25,9 +25,8 @@ using namespace cl; -#ifndef __ACPP_ENABLE_LLVM_SSCP_TARGET__ #define HIPSYCL_ENABLE_GROUP_ALGORITHM_TESTS -#endif + #ifdef TESTS_GROUPFUNCTION_FULL @@ -142,7 +141,7 @@ T initialize_type(T init) { template, int> = 0> ACPP_KERNEL_TARGET T initialize_type(elementType init) { - constexpr size_t N = T::get_count(); + constexpr size_t N = T::size(); if constexpr (std::is_same_v, bool>) return T{init}; @@ -221,7 +220,7 @@ inline void create_bool_test_data(std::vector &buffer, size_t local_size, } template -void check_binary_reduce(std::vector buffer, size_t local_size, size_t global_size, +void check_binary_reduce(std::vector buffer, std::vector input, size_t local_size, size_t global_size, std::vector expected, std::string name, size_t break_size = 0, size_t offset = 0) { std::vector cases{"everything except one false", "everything false", @@ -268,6 +267,7 @@ void test_nd_group_function_1d(size_t elements_per_thread, DataGenerator dg, for (int i = 0; i < local_sizes.size(); ++i) { size_t local_size = local_sizes[i]; size_t global_size = global_sizes[i]; + uint32_t used_sgrp_size = 0; std::vector host_buf(elements_per_thread * global_size, T{}); @@ -277,11 +277,12 @@ void 
test_nd_group_function_1d(size_t elements_per_thread, DataGenerator dg, { sycl::buffer buf{host_buf.data(), host_buf.size()}; + sycl::buffer used_sgrp_size_buffer(&used_sgrp_size, 1); queue.submit([&](sycl::handler &cgh) { using namespace sycl::access; auto acc = buf.template get_access(cgh); - + auto sgpr_size_acc = used_sgrp_size_buffer.template get_access(cgh); cgh.parallel_for>( sycl::nd_range<1>{global_size, local_size}, [=](sycl::nd_item<1> item) { @@ -289,13 +290,13 @@ void test_nd_group_function_1d(size_t elements_per_thread, DataGenerator dg, auto sg = item.get_sub_group(); T local_value = acc[item.get_global_linear_id()]; - + sgpr_size_acc[0] = sg.get_max_local_range().size(); f(acc, item.get_global_linear_id(), sg, g, local_value); }); }); } - vf(host_buf, original_host_buf, local_size, global_size); + vf(host_buf, original_host_buf,used_sgrp_size, local_size, global_size); } } @@ -346,7 +347,7 @@ void test_nd_group_function_2d(size_t elements_per_thread, DataGenerator dg, }); } - vf(host_buf, original_host_buf, local_size * local_size, global_size * global_size); + vf(host_buf, original_host_buf,0, local_size * local_size, global_size * global_size); } } diff --git a/tests/sycl/group_functions/group_functions_binary_reduce.cpp b/tests/sycl/group_functions/group_functions_binary_reduce.cpp index e8dfc56e9..992b1861e 100644 --- a/tests/sycl/group_functions/group_functions_binary_reduce.cpp +++ b/tests/sycl/group_functions/group_functions_binary_reduce.cpp @@ -35,9 +35,9 @@ BOOST_AUTO_TEST_CASE(group_x_of_local) { acc[global_linear_id] = sycl::any_of_group(g, static_cast(local_value)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, false, true, true}, "any_of"); }; @@ -55,10 +55,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_local) { acc[global_linear_id] = sycl::all_of_group(g, static_cast(local_value)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn, vOrig, local_size, global_size, std::vector{false, false, false, true}, "all_of"); }; @@ -75,10 +75,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_local) { acc[global_linear_id] = sycl::none_of_group(g, static_cast(local_value)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "none_of"); }; @@ -95,7 +95,6 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { using T = char; const size_t elements_per_thread = 1; - const uint32_t subgroup_size = detail::get_subgroup_size(sycl::queue{}); const auto data_generator = [](std::vector &v, size_t local_size, size_t global_size) { @@ -108,9 +107,9 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { acc[global_linear_id] = sycl::any_of_group(sg, static_cast(local_value)); }; const auto validation_function = [=](const std::vector &vIn, - const 
std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, false, true, true}, "any_of", subgroup_size); }; @@ -125,11 +124,11 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { acc[global_linear_id] = sycl::all_of_group(sg, static_cast(local_value)); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, - "all_of", subgroup_size); + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, + "all_of bool sub group", subgroup_size); }; test_nd_group_function_1d<__LINE__, T>(elements_per_thread, data_generator, @@ -142,10 +141,10 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_local) { acc[global_linear_id] = sycl::none_of_group(sg, static_cast(local_value)); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "none_of", subgroup_size); }; @@ -177,9 +176,9 @@ BOOST_AUTO_TEST_CASE(group_x_of_ptr_function) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, true, true, false}, "any_of", 0, 2 * global_size); }; @@ -204,10 +203,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_ptr_function) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "all_of", 0, 2 * global_size); }; @@ -232,10 +231,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_ptr_function) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, "none_of", 0, 2 * global_size); }; @@ -263,9 +262,9 @@ BOOST_AUTO_TEST_CASE(group_x_of_function) { sycl::any_of_group(g, static_cast(local_value), std::logical_not()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + 
detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, true, true, false}, "any_of"); }; @@ -284,10 +283,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_function) { sycl::all_of_group(g, static_cast(local_value), std::logical_not()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "all_of"); }; @@ -305,10 +304,10 @@ BOOST_AUTO_TEST_CASE(group_x_of_function) { sycl::none_of_group(g, static_cast(local_value), std::logical_not()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, "none_of"); }; @@ -325,7 +324,6 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { using T = char; const size_t elements_per_thread = 1; - const uint32_t subgroup_size = detail::get_subgroup_size(sycl::queue{}); const auto data_generator = [](std::vector &v, size_t local_size, size_t global_size) { @@ -339,9 +337,9 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { sycl::any_of_group(sg, static_cast(local_value), std::logical_not()); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - detail::check_binary_reduce(vIn, local_size, global_size, + detail::check_binary_reduce(vIn,vOrig, local_size, global_size, std::vector{true, true, true, false}, "any_of", subgroup_size); }; @@ -357,10 +355,10 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { sycl::all_of_group(sg, static_cast(local_value), std::logical_not()); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, true, false, false}, + vIn,vOrig, local_size, global_size, std::vector{false, true, false, false}, "all_of", subgroup_size); }; @@ -375,10 +373,10 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { sycl::none_of_group(sg, static_cast(local_value), std::logical_not()); }; const auto validation_function = [=](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { detail::check_binary_reduce( - vIn, local_size, global_size, std::vector{false, false, false, true}, + vIn,vOrig, local_size, global_size, std::vector{false, false, false, true}, "none_of", subgroup_size); }; @@ -389,4 +387,4 @@ BOOST_AUTO_TEST_CASE(sub_group_x_of_function) { } BOOST_AUTO_TEST_SUITE_END() -#endif +#endif \ No newline at end of file diff --git a/tests/sycl/group_functions/group_functions_misc.cpp b/tests/sycl/group_functions/group_functions_misc.cpp index 65c7086e7..7d7ed0c69 100644 --- a/tests/sycl/group_functions/group_functions_misc.cpp +++ b/tests/sycl/group_functions/group_functions_misc.cpp @@ -46,7 +46,7 @@ 
BOOST_AUTO_TEST_CASE(group_barrier) { } }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = (i % local_size) * 10000; @@ -82,7 +82,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(g, local_value); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = detail::initialize_type(((int)i / local_size) * local_size) + @@ -112,7 +112,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(g, local_value, 10); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = detail::initialize_type(((int)i / local_size) * local_size + 10) + @@ -146,7 +146,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(g, local_value, sycl::id<2>(0, 10)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = detail::initialize_type(((int)i / local_size) * local_size + 10) + @@ -187,9 +187,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { acc[global_linear_id] = sycl::group_broadcast(sg, local_value); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < vIn.size(); ++i) { int expected_base = i % local_size; expected_base = ((int)expected_base / subgroup_size) * @@ -217,16 +216,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { - acc[global_linear_id] = sycl::group_broadcast(sg, local_value, 10); + acc[global_linear_id] = sycl::group_broadcast(sg, local_value, 7); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < vIn.size(); ++i) { int expected_base = i % local_size; expected_base = ((int)expected_base / subgroup_size) * subgroup_size; - expected_base += ((int)i / local_size) * local_size + 10; + expected_base += ((int)i / local_size) * local_size + 7; T expected = detail::initialize_type(expected_base) + detail::get_offset(global_size); @@ -249,16 +247,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { - acc[global_linear_id] = sycl::group_broadcast(sg, local_value, sycl::id<1>(10)); + acc[global_linear_id] = 
sycl::group_broadcast(sg, local_value, sycl::id<1>(7)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < vIn.size(); ++i) { int expected_base = i % local_size; expected_base = ((int)expected_base / subgroup_size) * subgroup_size; - expected_base += ((int)i / local_size) * local_size + 10; + expected_base += ((int)i / local_size) * local_size + 7; T expected = detail::initialize_type(expected_base) + detail::get_offset(global_size); @@ -280,7 +277,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_broadcast, T, test_types) { } } -#if !defined(REDUCED_LOCAL_MEM_USAGE) +#if defined(ACPP_TEST_WORK_GROUP_SHUFFLE_EXT) and !defined(REDUCED_LOCAL_MEM_USAGE) BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { const size_t elements_per_thread = 1; const auto data_generator = [](std::vector &v, size_t local_size, @@ -294,7 +291,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_left(g, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -330,7 +327,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_right(g, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -366,7 +363,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::permute_group_by_xor(g, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -403,7 +400,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_shuffle_like, T, test_types) { sycl::select_from_group(g, local_value, sycl::id()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < vIn.size(); ++i) { T expected = @@ -445,9 +442,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_left(sg, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for (size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { @@ -490,9 +486,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::shift_group_right(sg, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, 
size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for (size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { @@ -523,7 +518,6 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { } } }; - test_nd_group_function_1d<__LINE__, T>(elements_per_thread, data_generator, tested_function, validation_function); } @@ -534,9 +528,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::permute_group_by_xor(sg, local_value, 1); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for (size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { @@ -579,9 +572,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(subgroup_shuffle_like, T, test_types) { acc[global_linear_id] = sycl::select_from_group(sg, local_value, sycl::id<1>()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { for (size_t j = 0; j < (local_size + subgroup_size - 1) / subgroup_size; ++j) { for (size_t k = 0; k < subgroup_size; ++k) { diff --git a/tests/sycl/group_functions/group_functions_reduce.cpp b/tests/sycl/group_functions/group_functions_reduce.cpp index 3a4799ed2..d131c4475 100644 --- a/tests/sycl/group_functions/group_functions_reduce.cpp +++ b/tests/sycl/group_functions/group_functions_reduce.cpp @@ -30,7 +30,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce_mul, T, test_types) { acc[global_linear_id] = sycl::reduce_over_group(g, local_value, std::multiplies()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = vOrig[i * local_size]; @@ -74,7 +74,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce, T, test_types) { acc[global_linear_id] = sycl::reduce_over_group(g, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = T{}; @@ -96,7 +96,6 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce, T, test_types) { test_nd_group_function_1d<__LINE__, T>(elements_per_thread, data_generator, tested_function, validation_function); - test_nd_group_function_2d<__LINE__, T>(elements_per_thread, data_generator, tested_function, validation_function); } @@ -108,7 +107,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce, T, test_types) { g, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = 
detail::initialize_type(10); @@ -158,7 +157,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce_ptr, T, test_types) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = T{}; @@ -197,7 +196,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_reduce_ptr, T, test_types) { acc[global_linear_id + 2 * global_size] = local; }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { for (size_t i = 0; i < global_size / local_size; ++i) { T expected = detail::initialize_type(10); @@ -236,12 +235,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_reduce, T, test_types) { { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { - acc[global_linear_id] = sycl::reduce_over_group(sg, local_value, std::plus()); + acc[global_linear_id] = sycl::reduce_over_group(sg, local_value, sycl::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { T expected = T{}; auto actual_warp_size = local_size < subgroup_size ? local_size : subgroup_size; @@ -267,12 +265,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_reduce, T, test_types) { const auto tested_function = [](auto acc, size_t global_linear_id, sycl::sub_group sg, auto g, T local_value) { acc[global_linear_id] = sycl::reduce_over_group( - sg, local_value, detail::initialize_type(10), std::plus()); + sg, local_value, detail::initialize_type(10), sycl::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { T expected = detail::initialize_type(10); auto actual_warp_size = local_size < subgroup_size ? 
local_size : subgroup_size; diff --git a/tests/sycl/group_functions/group_functions_scan.cpp b/tests/sycl/group_functions/group_functions_scan.cpp index 89e81b89d..04ec7a7c4 100644 --- a/tests/sycl/group_functions/group_functions_scan.cpp +++ b/tests/sycl/group_functions/group_functions_scan.cpp @@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_mul, T, test_types) { g, local_value, detail::initialize_type(10), std::multiplies()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -77,7 +77,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan, T, test_types) { acc[global_linear_id] = sycl::exclusive_scan_over_group(g, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan, T, test_types) { g, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -167,7 +167,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_ptr, T, test_types) { sycl::joint_exclusive_scan(g, start.get(), end.get(), out.get(), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -211,7 +211,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_ptr, T, test_types) { detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -223,7 +223,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_exclusive_scan_ptr, T, test_types) { for (size_t j = i * 2 * local_size; j < (i + 1) * local_size * 2; ++j) { T computed = vIn[j + global_size * 2]; - BOOST_TEST(detail::compare_type(expected[j], computed), + BOOST_TEST_REQUIRE(detail::compare_type(expected[j], computed), detail::type_to_string(computed) << " at position " << j << " instead of " << detail::type_to_string(expected[j]) @@ -258,10 +258,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_exclusive_scan, T, test_types) { acc[global_linear_id] = sycl::exclusive_scan_over_group(sg, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { expected[i * local_size] = T{}; auto actual_warp_size = local_size < subgroup_size ? 
local_size : subgroup_size; @@ -294,10 +293,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_exclusive_scan, T, test_types) { sg, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t subgroup_size, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { expected[i * local_size] = detail::initialize_type(10); @@ -340,7 +338,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan_mul, T, test_types) { sycl::inclusive_scan_over_group(g, local_value, std::multiplies()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -386,7 +384,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan, T, test_types) { acc[global_linear_id] = sycl::inclusive_scan_over_group(g, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -423,7 +421,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan, T, test_types) { g, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -475,7 +473,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan_ptr, T, test_types) { sycl::joint_inclusive_scan(g, start.get(), end.get(), out.get(), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -518,7 +516,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(group_inclusive_scan_ptr, T, test_types) { std::plus(), detail::initialize_type(10)); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig,size_t, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); @@ -567,10 +565,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_inclusive_scan, T, test_types) { acc[global_linear_id] = sycl::inclusive_scan_over_group(sg, local_value, std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; ++i) { expected[i * local_size] = vOrig[i * local_size]; @@ -603,10 +600,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(sub_group_inclusive_scan, T, test_types) { sg, local_value, detail::initialize_type(10), std::plus()); }; const auto validation_function = [](const std::vector &vIn, - const std::vector &vOrig, size_t local_size, + const std::vector &vOrig, size_t subgroup_size, size_t local_size, size_t global_size) { std::vector expected(vOrig.size()); - auto subgroup_size = detail::get_subgroup_size(); for (size_t i = 0; i < global_size / local_size; 
++i) { expected[i * local_size] = vOrig[i * local_size] + detail::initialize_type<T>(10); diff --git a/tests/sycl/profiler.cpp b/tests/sycl/profiler.cpp index a34b719bf..4f2f08793 100644 --- a/tests/sycl/profiler.cpp +++ b/tests/sycl/profiler.cpp @@ -95,7 +95,14 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_submit>(); auto t13 = evt1.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); - BOOST_CHECK(t11 <= t12 && t12 <= t13); + // We cannot require submit time <= command start time: on some backends + // (e.g. CUDA), start and end times are reconstructed from elapsed-time + // measurements taken in low-precision floating point, while the submission + // timestamp is exact, so rounding errors can cause t12 > t11. + // The same could in principle happen when comparing submission time + // with command end time, but that is less likely. + BOOST_CHECK(t11 <= t13 && t12 <= t13); auto evt2 = queue.submit([&](cl::sycl::handler &cgh) { auto acc = buf1.get_access(cgh); @@ -116,7 +123,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t23 = evt2.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); - BOOST_CHECK(t21 <= t22 && t22 <= t23); + BOOST_CHECK(t21 <= t23 && t22 <= t23); auto t31 = evt3.get_profiling_info< cl::sycl::info::event_profiling::command_submit>(); @@ -124,7 +131,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t33 = evt3.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); - BOOST_CHECK(t31 <= t32 && t32 <= t33); + BOOST_CHECK(t31 <= t33 && t32 <= t33); BOOST_CHECK(t21 <= t31 && t23 <= t32); auto evt4 = queue.submit([&](cl::sycl::handler &cgh) { @@ -143,7 +150,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t53 = evt5.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); - BOOST_CHECK(t51 <= t52 && t52 <= t53); + BOOST_CHECK(t51 <= t53 && t52 <= t53); // re-ordered auto t41 = evt4.get_profiling_info< cl::sycl::info::event_profiling::command_submit>(); @@ -152,7 +159,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t43 = evt4.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); - BOOST_CHECK(t41 <= t42 && t42 <= t43); + BOOST_CHECK(t41 <= t43 && t42 <= t43); // usm auto *src = cl::sycl::malloc_shared(n, queue); @@ -166,7 +173,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t63 = evt6.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); - BOOST_CHECK(t61 <= t62 && t62 <= t63); + BOOST_CHECK(t61 <= t63 && t62 <= t63); auto evt7 = queue.submit( [&](cl::sycl::handler &cgh) { cgh.memcpy(dest, src, sizeof src); }); @@ -176,7 +183,7 @@ BOOST_AUTO_TEST_CASE(queue_profiling) cl::sycl::info::event_profiling::command_start>(); auto t73 = evt7.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); - BOOST_CHECK(t71 <= t72 && t72 <= t73); + BOOST_CHECK(t71 <= t73 && t72 <= t73); auto evt8 = queue.submit( [&](cl::sycl::handler &cgh) { cgh.prefetch(dest, sizeof src); }); @@ -187,7 +194,7 @@ auto t83 = evt8.get_profiling_info< cl::sycl::info::event_profiling::command_end>(); // run time may be zero if prefetching is a no-op - BOOST_CHECK(t81 <= t82 && t82 <= t83); + BOOST_CHECK(t81 <= t83 && t82 <= t83); cl::sycl::free(src, queue); cl::sycl::free(dest, queue);
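// For reference, a minimal sketch of the profiling pattern these relaxed
// checks exercise (illustrative only: the queue setup and empty kernel are
// assumptions, not code from this patch):
//
//   cl::sycl::queue q{cl::sycl::property::queue::enable_profiling{}};
//   auto evt = q.submit([&](cl::sycl::handler &cgh) { cgh.single_task([] {}); });
//   evt.wait();
//   auto submit = evt.get_profiling_info<cl::sycl::info::event_profiling::command_submit>();
//   auto start = evt.get_profiling_info<cl::sycl::info::event_profiling::command_start>();
//   auto end = evt.get_profiling_info<cl::sycl::info::event_profiling::command_end>();
//
// Only start <= end is measured by a single backend clock; submit comes from
// the host timer, so the tests now assert submit <= end and start <= end
// rather than submit <= start.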