diff --git a/csrc/common.cuh b/csrc/common.cuh
index 61bef3c27..03d934270 100644
--- a/csrc/common.cuh
+++ b/csrc/common.cuh
@@ -7,11 +7,17 @@
 // Warp size
 #if BNB_HIP
-// CDNA (gfx9xx) = 64, RDNA = 32.
+// CDNA (gfx9xx) = 64, RDNA (gfx10xx/gfx11xx/gfx12xx) = 32.
+// __AMDGCN_WAVEFRONT_SIZE is not defined by all compiler versions (removed since ROCm 7.0),
+// so fall back to architecture-family macros when it is absent.
+// This is a macro that is defined by the compiler during each device-code pass and as such
+// should only be used inside kernels.
 #ifdef __AMDGCN_WAVEFRONT_SIZE
 #define BNB_WARP_SIZE __AMDGCN_WAVEFRONT_SIZE
+#elif defined(__GFX9__)
+#define BNB_WARP_SIZE 64 // CDNA
 #else
-#define BNB_WARP_SIZE 64 // Safe default for HIP (matches CDNA)
+#define BNB_WARP_SIZE 32 // RDNA and other
 #endif
 #else
 #define BNB_WARP_SIZE 32
 #endif
diff --git a/csrc/ops.cu b/csrc/ops.cu
index ef13678e4..c1f8e65bc 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -10,6 +10,26 @@
 
 #define ERR_NOT_IMPLEMENTED 100
 
+#if BNB_HIP
+#include <atomic>
+#include <hip/hip_runtime.h>
+
+// NOTE: This queries device 0 once and caches the result. On mixed RDNA+CDNA
+// systems (warp size 32 vs 64) this will return the wrong value for whichever
+// device doesn't match device 0.
+static int bnb_host_warp_size() {
+    static std::atomic<int> warp_size{0};
+    int ws = warp_size.load(std::memory_order_relaxed);
+    if (ws == 0) {
+        (void)hipDeviceGetAttribute(&ws, hipDeviceAttributeWarpSize, 0);
+        warp_size.store(ws, std::memory_order_relaxed);
+    }
+    return ws;
+}
+#else
+static constexpr int bnb_host_warp_size() { return 32; }
+#endif
+
 using std::cout;
 using std::endl;
 
@@ -35,10 +55,16 @@ void quantizeBlockwise(
         kQuantizeBlockwise<T, 128, 2, 0, DATA_TYPE><<<num_blocks, 64>>>(code, A, absmax, out, rand, rand_offset, n);
     else if (blocksize == 64) {
 #if BNB_HIP
-        // On HIP with 64-wide warps (CDNA), use specialized kernel for 4-bit types
         if constexpr (DATA_TYPE > 0) {
-            kQuantizeBlockwiseSmall<T, DATA_TYPE>
-                <<<(num_blocks + 1) / 2, 64>>>(code, A, absmax, out, rand, rand_offset, n);
+            if (bnb_host_warp_size() == 64) {
+                // CDNA: kQuantizeBlockwiseSmall is compiled with THREADS=64
+                kQuantizeBlockwiseSmall<T, DATA_TYPE>
+                    <<<(num_blocks + 1) / 2, 64>>>(code, A, absmax, out, rand, rand_offset, n);
+            } else {
+                // RDNA: standard kernel (same as CUDA path)
+                kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE>
+                    <<<num_blocks, 32>>>(code, A, absmax, out, rand, rand_offset, n);
+            }
         } else {
             kQuantizeBlockwise<T, 64, 2, 0, DATA_TYPE><<<num_blocks, 32>>>(code, A, absmax, out, rand, rand_offset, n);
         }
@@ -407,8 +433,7 @@ void gemm_4bit_inference_naive(
     int num_blocks = (m + 3) / 4;
 
 #if BNB_HIP
-    // On 64-wide warp architectures, each warp processes 2 rows instead of 4
-    if (BNB_WARP_SIZE == 64) {
+    if (bnb_host_warp_size() == 64) {
         num_blocks = (m + 1) / 2;
     }
 #endif