Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
# CUDA attention / MoE / KV-cache kernel selection switches.
# The cmake_dependent_option entries default to ON while onnxruntime_USE_CUDA is true and are forced OFF otherwise.
cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF)
cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
# Opt-in FP4 QMoE kernels. Turning this ON with a CUDA Toolkit older than 12.8 is rejected
# with a FATAL_ERROR later in this file.
option(onnxruntime_ENABLE_CUDA_FP4_QMOE "Build CUDA QMoE FP4 kernels" OFF)
option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF)
option(onnxruntime_USE_INT4_KV_CACHE "Build cuda kernels for int4 kv cache" OFF)
# Unlike the other kernel switches above, FP8 KV cache kernels are built by default.
option(onnxruntime_USE_FP8_KV_CACHE "Build cuda kernels for fp8 kv cache" ON)
Expand Down Expand Up @@ -785,8 +786,8 @@ if (onnxruntime_USE_CUDA)
endif()

# Quick build mode: announce it once and add ORT_QUICK_BUILD=1 to ORT_PROVIDER_FLAGS.
# The flag is appended exactly once; a second identical append only duplicated the
# definition in the flag list.
if (onnxruntime_QUICK_BUILD)
  message( STATUS "Quick build mode: reducing selected CUDA/CUTLASS kernel instantiations")
  list(APPEND ORT_PROVIDER_FLAGS -DORT_QUICK_BUILD=1)
endif()

if (onnxruntime_USE_INT4_KV_CACHE)
Expand Down Expand Up @@ -1469,6 +1470,28 @@ if (onnxruntime_USE_CUDA)
# FP4 support requires CUDA Toolkit 12.8 or newer:
#   - ENABLE_FP4 is always defined for such toolkits.
#   - ENABLE_CUDA_FP4_QMOE is additionally defined when onnxruntime_ENABLE_CUDA_FP4_QMOE is ON;
#     requesting that option with an older toolkit is a configuration error.
#   - PLACEHOLDER_KERNELS is defined when none of the target architectures is sm_100 or newer.
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  add_definitions("-DENABLE_FP4")
  message(STATUS "CUDA Toolkit version is greater or equal than 12.8, enable -DENABLE_FP4 flag")
  if(onnxruntime_ENABLE_CUDA_FP4_QMOE)
    add_definitions("-DENABLE_CUDA_FP4_QMOE")
    message(STATUS "CUDA FP4 QMoE kernels enabled")
  endif()
  # Check if any target architecture supports FP4 kernels (sm_100+).
  # If not, define PLACEHOLDER_KERNELS so that stub implementations are used
  # instead of compiling the full CUTLASS sm_100/sm_120 kernel templates,
  # which are extremely slow to compile for unsupported architectures.
  set(_has_sm100_plus FALSE)
  foreach(_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
    # The "all" / "all-major" keywords carry no number. With a 12.8+ toolkit they
    # include sm_100, so they must not fall through to the placeholder kernels.
    # NOTE(review): this only matters if the keyword has not already been expanded
    # into explicit architecture numbers before this point - confirm.
    if(_arch MATCHES "^(all|all-major)$")
      set(_has_sm100_plus TRUE)
      break()
    endif()
    # Only the leading digits are compared, so entries with a trailing suffix
    # after the number are handled as well.
    if(_arch MATCHES "^([0-9]+)")
      if(CMAKE_MATCH_1 GREATER_EQUAL 100)
        set(_has_sm100_plus TRUE)
        break()
      endif()
    endif()
  endforeach()
  if(NOT _has_sm100_plus)
    add_definitions("-DPLACEHOLDER_KERNELS")
    message(STATUS "No sm_100+ target architecture found, using placeholder FP4 kernels")
  endif()
elseif(onnxruntime_ENABLE_CUDA_FP4_QMOE)
  # The FP4 QMoE kernels cannot be built without FP4 toolkit support; fail early.
  message(FATAL_ERROR "onnxruntime_ENABLE_CUDA_FP4_QMOE requires CUDA Toolkit version 12.8 or newer")
endif()

if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
Expand Down
14 changes: 14 additions & 0 deletions cmake/external/cuda_configuration.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,20 @@ macro(setup_cuda_architectures)
add_definitions("-DHAS_SM80_OR_LATER")
endif()

# Determine whether at least one requested CUDA architecture is SM90 or newer.
# ORT_HAS_SM90_OR_LATER is cleared first so the result only reflects the
# current contents of CMAKE_CUDA_ARCHITECTURES_ORIG.
unset(ORT_HAS_SM90_OR_LATER)
foreach(requested_arch IN LISTS CMAKE_CUDA_ARCHITECTURES_ORIG)
  # Entries without a leading number cannot be compared; ignore them.
  if(NOT requested_arch MATCHES "^([0-9]+)")
    continue()
  endif()
  # CMAKE_MATCH_1 holds the leading digits captured by the match above.
  if(CMAKE_MATCH_1 GREATER_EQUAL 90)
    set(ORT_HAS_SM90_OR_LATER ON)
    break()
  endif()
endforeach()

# Expose the result to the compiled sources as a preprocessor definition.
if(ORT_HAS_SM90_OR_LATER)
  add_definitions("-DHAS_SM90_OR_LATER")
endif()

set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
Expand Down
2 changes: 1 addition & 1 deletion cmake/external/cutlass.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ onnxruntime_fetchcontent_declare(

# Make the CUTLASS sources available at configure time unless a previous call already populated them.
FetchContent_GetProperties(cutlass)
if(NOT cutlass_POPULATED)
# NOTE(review): both FetchContent_Populate and FetchContent_MakeAvailable are invoked here.
# Populate only fetches the sources, while MakeAvailable also adds the project's CMakeLists.txt
# via add_subdirectory when it performs the population itself. Called after Populate,
# MakeAvailable presumably becomes a no-op - confirm which of the two calls is intended
# and drop the other.
FetchContent_Populate(cutlass)
FetchContent_MakeAvailable(cutlass)
endif()
16 changes: 16 additions & 0 deletions cmake/onnxruntime_providers_cpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,22 @@ if(onnxruntime_QUICK_BUILD)
list(FILTER onnxruntime_cuda_contrib_ops_cu_srcs EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|192|256)")
endif()

# Never compile the SM90 mixed-FP4 fallback stub: the SM90 mixed FP4 launcher is enabled
# for the native MXFP4 W4A16 path, and the stub would provide duplicate instantiations.
list(FILTER onnxruntime_cuda_contrib_ops_cu_srcs EXCLUDE REGEX "moe_gemm_tma_ws_sm90_mixed_fp4_stub\\.cu")

if(onnxruntime_ENABLE_CUDA_FP4_QMOE)
  # FP4 QMoE build: CUDA 13 PTXAS does not complete the FP4 M=128/N=64 pingpong
  # specializations in this build configuration. The dispatcher routes that tile through
  # the cooperative mainloop variants instead, so only those unused generated units are dropped.
  list(FILTER onnxruntime_cuda_contrib_ops_cu_srcs EXCLUDE REGEX "moe_gemm_tma_ws_sm90_fp4_(fp16|bf16)_m128_n64_cm[12]_cn[12]_pp\\.generated\\.cu")
else()
  # FP4 QMoE disabled: remove every FP4 MoE GEMM source (generated SM90/SM120 units and
  # the fp16/bf16/fp4/fp8 x fp4 kernel files) from the CUDA contrib-ops source list.
  foreach(fp4_moe_src_regex IN ITEMS
      "moe_gemm_tma_ws_sm90_fp4_.*\\.generated\\.cu"
      "moe_gemm_tma_ws_sm120_fp4_.*\\.generated\\.cu"
      "moe_gemm_kernels_(fp16|bf16)_fp4\\.cu"
      "moe_gemm_kernels_fp4_fp4\\.cu"
      "moe_gemm_kernels_fp8_fp4\\.cu")
    list(FILTER onnxruntime_cuda_contrib_ops_cu_srcs EXCLUDE REGEX "${fp4_moe_src_regex}")
  endforeach()
endif()

file(GLOB_RECURSE onnxruntime_js_contrib_ops_cc_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/contrib_ops/js/*.h"
"${ONNXRUNTIME_ROOT}/contrib_ops/js/*.cc"
Expand Down
15 changes: 11 additions & 4 deletions cmake/onnxruntime_providers_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@
)
endif()
# add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cuda_contrib_ops_cc_srcs} ${onnxruntime_cuda_contrib_ops_cu_srcs})
list(APPEND onnxruntime_providers_cuda_src ${onnxruntime_cuda_contrib_ops_cc_srcs} ${onnxruntime_cuda_contrib_ops_cu_srcs})
endif()

Expand Down Expand Up @@ -197,7 +196,7 @@
# Note: The minimum required CUDA version is greater than 11.3.
# CUDA 11.3+ supports parallel compilation
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver-threads
# onnxruntime_NVCC_THREADS is a cache entry, so it can be overridden with -Donnxruntime_NVCC_THREADS=<N>.
# NOTE(review): two different defaults ("1" and "4") are declared back to back. Without FORCE,
# a later set(... CACHE ...) does not replace an existing cache entry, so on a fresh cache the
# first declaration wins and the "4" default has no effect. Keep only the intended one.
set(onnxruntime_NVCC_THREADS "1" CACHE STRING "Number of threads that NVCC can use for compilation.")
set(onnxruntime_NVCC_THREADS "4" CACHE STRING "Number of threads that NVCC can use for compilation.")
# Pass the value to nvcc for CUDA sources only; the SHELL: prefix keeps "--threads <N>"
# together as one option group.
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")

# suppress warnings like this:
Expand All @@ -214,6 +213,8 @@
endif()
# skip diagnosis error caused by cuda header files
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>")
# CUDA 12.8 also reports deprecated implicit by-copy 'this' captures from CUTLASS headers.
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2908>")
endif()

if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
Expand Down Expand Up @@ -310,7 +311,7 @@
message( WARNING "To compile with NHWC ops enabled please compile against cuDNN 9 or newer." )
endif()
endif()
target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas CUDNN::cudnn_all cudnn_frontend CUDA::curand CUDA::cufft CUDA::cudart
target_link_libraries(${target} PRIVATE CUDA::cublasLt CUDA::cublas CUDNN::cudnn_all cudnn_frontend CUDA::curand CUDA::cufft CUDA::cudart CUDA::nvrtc CUDA::cuda_driver
${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 safeint_interface)
endif()

Expand All @@ -333,15 +334,21 @@
set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime")

if("90" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
if(ORT_HAS_SM90_OR_LATER)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas=-w>)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-DCUTLASS_ENABLE_GDC_FOR_SM90=1>)
target_compile_definitions(${target} PRIVATE COMPILE_HOPPER_TMA_GEMMS)
target_compile_definitions(${target} PRIVATE COMPILE_HOPPER_TMA_GROUPED_GEMMS)
if (MSVC)
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /bigobj>")
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4172>")
endif()
endif()

# Define COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS only when "120" is explicitly listed in
# CMAKE_CUDA_ARCHITECTURES_ORIG. This is an exact list-membership test, not a ">= 120" comparison.
if("120" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
target_compile_definitions(${target} PRIVATE COMPILE_BLACKWELL_SM120_TMA_GROUPED_GEMMS)
endif()

# Link CUPTI only when CUDA profiling support was requested.
if (onnxruntime_ENABLE_CUDA_PROFILING) # configure cupti for cuda profiling
target_link_libraries(${target} PRIVATE CUDA::cupti)
endif()
Expand Down
1 change: 1 addition & 0 deletions cmake/onnxruntime_providers_cuda_plugin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
# Extra nvcc options for the CUDA plugin, applied to CUDA sources only.
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>"
# Diagnostic 221 is also suppressed for the main CUDA provider, where it is described as
# coming from CUDA header files.
"$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>"
# Diagnostic 2908: deprecated implicit by-copy 'this' captures reported from CUTLASS headers.
"$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=2908>"
)

if (MSVC)
Expand Down
11 changes: 11 additions & 0 deletions cmake/onnxruntime_python.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,17 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
${pybind11_lib}
Python::NumPy
)

# CUDA builds only: compile the fpA_intB GEMM adaptor/preprocessor CUDA sources and
# cuda_call.cc directly into the Python binding module, and put the CUTLASS headers on
# its include path.
if (onnxruntime_USE_CUDA)
  # Brings in the cutlass dependency so that cutlass_SOURCE_DIR is defined.
  include(cutlass)
  target_include_directories(onnxruntime_pybind11_state PRIVATE ${cutlass_SOURCE_DIR}/include)

  target_sources(onnxruntime_pybind11_state
    PRIVATE
      "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/llm/fpA_intB_gemm_adaptor.cu"
      "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu"
      "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_call.cc"
  )
endif()

set(onnxruntime_pybind11_state_dependencies
${onnxruntime_EXTERNAL_DEPENDENCIES}
${pybind11_dep}
Expand Down
Loading
Loading