diff --git a/README.md b/README.md index bf3b76a93..11f7157dd 100755 --- a/README.md +++ b/README.md @@ -29,11 +29,12 @@ It can also be built from source with `pip install .` from this repository. Try passing `--no-build-isolation` to `pip` if installation encounters difficulties either when building from source or installing from PyPi. Common `pip` complaints that can be resolved in this way include PyTorch versions, but other cases exist as well. Other requirements: -- Linux - NVIDIA GPU - PyTorch 1.12+ - CUDA 11.6+ +Note: on Windows, it is recommended to use x64 Native Tools Command Prompt for VS with DISTUTILS_USE_SDK=1. + For AMD cards, see additional prerequisites below. ## Usage diff --git a/csrc/selective_scan/selective_scan_bwd_kernel.cuh b/csrc/selective_scan/selective_scan_bwd_kernel.cuh index c720ba28c..a38165e63 100755 --- a/csrc/selective_scan/selective_scan_bwd_kernel.cuh +++ b/csrc/selective_scan/selective_scan_bwd_kernel.cuh @@ -14,11 +14,18 @@ #include #include #include + #define ROCM_ONLY(x) #else #include namespace cub = hipcub; + #define ROCM_ONLY(x) x #endif +#ifndef M_LOG2E +#define M_LOG2E 1.4426950408889634074 +#endif + + #include "selective_scan.h" #include "selective_scan_common.h" #include "reverse_scan.cuh" @@ -511,15 +518,9 @@ void selective_scan_bwd_launch(SSMParamsBwd ¶ms, cudaStream_t stream) { auto kernel = &selective_scan_bwd_kernel; if (kSmemSize >= 48 * 1024) { - - #ifndef USE_ROCM - C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); - #else C10_CUDA_CHECK(cudaFuncSetAttribute( - (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); - std::cerr << "Warning (selective_scan_bwd_kernel): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl; - #endif + ROCM_ONLY((void *)) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + ROCM_ONLY(std::cerr << "Warning (selective_scan_bwd_kernel): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl); } diff --git a/csrc/selective_scan/selective_scan_fwd_kernel.cuh b/csrc/selective_scan/selective_scan_fwd_kernel.cuh index 80e9e37e3..103d1454f 100755 --- a/csrc/selective_scan/selective_scan_fwd_kernel.cuh +++ b/csrc/selective_scan/selective_scan_fwd_kernel.cuh @@ -12,11 +12,18 @@ #include #include #include + #define ROCM_ONLY(x) #else #include namespace cub = hipcub; + #define ROCM_ONLY(x) x #endif +#ifndef M_LOG2E +#define M_LOG2E 1.4426950408889634074 +#endif + + #include "selective_scan.h" #include "selective_scan_common.h" #include "static_switch.h" @@ -311,7 +318,7 @@ template void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block // processing 1 row. - constexpr int kNRows = 1; + static constexpr int kNRows = 1; BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] { BOOL_SWITCH(params.is_variable_B, kIsVariableB, [&] { BOOL_SWITCH(params.is_variable_C, kIsVariableC, [&] { @@ -329,14 +336,9 @@ void selective_scan_fwd_launch(SSMParamsBase ¶ms, cudaStream_t stream) { if (kSmemSize >= 48 * 1024) { - #ifndef USE_ROCM - C10_CUDA_CHECK(cudaFuncSetAttribute( - kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); - #else C10_CUDA_CHECK(cudaFuncSetAttribute( - (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); - std::cerr << "Warning (selective_scan_fwd_kernel): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl; - #endif + ROCM_ONLY((void *)) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); + ROCM_ONLY(std::cerr << "Warning (selective_scan_fwd_kernel): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl); } kernel<<>>(params); diff --git a/csrc/selective_scan/static_switch.h b/csrc/selective_scan/static_switch.h index 7920ac045..5b24446e0 100644 --- a/csrc/selective_scan/static_switch.h +++ b/csrc/selective_scan/static_switch.h @@ -16,10 +16,10 @@ #define BOOL_SWITCH(COND, CONST_NAME, ...) \ [&] { \ if (COND) { \ - constexpr bool CONST_NAME = true; \ + static constexpr bool CONST_NAME = true; \ return __VA_ARGS__(); \ } else { \ - constexpr bool CONST_NAME = false; \ + static constexpr bool CONST_NAME = false; \ return __VA_ARGS__(); \ } \ }() diff --git a/pyproject.toml b/pyproject.toml index 5831fe66e..661d20f6f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,8 @@ classifiers = [ ] dependencies = [ "torch", - "triton", + "triton ; sys_platform != 'win32'", + "triton-windows ; sys_platform == 'win32'", "ninja", "einops", "transformers",