diff --git a/csrc/common.cuh b/csrc/common.cuh
index 61bef3c27..03d934270 100644
--- a/csrc/common.cuh
+++ b/csrc/common.cuh
@@ -7,11 +7,17 @@
 // Warp size
 #if BNB_HIP
-// CDNA (gfx9xx) = 64, RDNA = 32.
+// CDNA (gfx9xx) = 64, RDNA (gfx10xx/gfx11xx/gfx12xx) = 32.
+// __AMDGCN_WAVEFRONT_SIZE is not defined by all compiler versions (removed since ROCm 7.0),
+// so fall back to architecture-family macros when it is absent.
+// This is a macro that is defined by the compiler during each device-code pass and as such
+// should only be used inside kernels.
 #ifdef __AMDGCN_WAVEFRONT_SIZE
 #define BNB_WARP_SIZE __AMDGCN_WAVEFRONT_SIZE
+#elif defined(__GFX9__)
+#define BNB_WARP_SIZE 64 // CDNA
 #else
-#define BNB_WARP_SIZE 64 // Safe default for HIP (matches CDNA)
+#define BNB_WARP_SIZE 32 // RDNA and other
 #endif
 #else
 #define BNB_WARP_SIZE 32
 #endif
diff --git a/csrc/ops.cu b/csrc/ops.cu
index ef13678e4..c1f8e65bc 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -10,6 +10,26 @@
 
 #define ERR_NOT_IMPLEMENTED 100
 
+#if BNB_HIP
+#include <atomic>
+#include <hip/hip_runtime.h>
+
+// NOTE: This queries device 0 once and caches the result. On mixed RDNA+CDNA
+// systems (warp size 32 vs 64) this will return the wrong value for whichever
+// device doesn't match device 0.
+static int bnb_host_warp_size() {
+    static std::atomic<int> warp_size{0};
+    int ws = warp_size.load(std::memory_order_relaxed);
+    if (ws == 0) {
+        (void)hipDeviceGetAttribute(&ws, hipDeviceAttributeWarpSize, 0);
+        warp_size.store(ws, std::memory_order_relaxed);
+    }
+    return ws;
+}
+#else
+static constexpr int bnb_host_warp_size() { return 32; }
+#endif
+
 using std::cout;
 using std::endl;
 
@@ -35,10 +55,16 @@ void quantizeBlockwise(
         kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
     else if (blocksize == 64) {
 #if BNB_HIP
-        // On HIP with 64-wide warps (CDNA), use specialized kernel for 4-bit types
         if constexpr (DATA_TYPE > 0) {
-            kQuantizeBlockwiseSmall<T, DATA_TYPE>
-                <<<(num_blocks + 1) / 2, 64>>>(code, A, absmax, out, rand, rand_offset, n);
+            if (bnb_host_warp_size() == 64) {
+                // CDNA: kQuantizeBlockwiseSmall is compiled with THREADS=64
+                kQuantizeBlockwiseSmall<T, DATA_TYPE>
+                    <<<(num_blocks + 1) / 2, 64>>>(code, A, absmax, out, rand, rand_offset, n);
+            } else {
+                // RDNA: standard kernel (same as CUDA path)
+                kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>
+                    <<<num_blocks, 32>>>(code, A, absmax, out, rand, rand_offset, n);
+            }
         } else {
             kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE><<<num_blocks, 32>>>(code, A, absmax, out, rand, rand_offset, n);
         }
@@ -407,8 +433,7 @@ void gemm_4bit_inference_naive(
     int num_blocks = (m + 3) / 4;
 
 #if BNB_HIP
-    // On 64-wide warp architectures, each warp processes 2 rows instead of 4
-    if (BNB_WARP_SIZE == 64) {
+    if (bnb_host_warp_size() == 64) {
         num_blocks = (m + 1) / 2;
     }
 #endif