32 commits
d10fa92
Initial commit
Micky774 Oct 24, 2025
eef7dc0
Updated to build from source by default
Micky774 Oct 24, 2025
cc68ab7
Updated for V3 API
Micky774 Oct 31, 2025
4455361
Fixed build, reverted AOTriton bwd changes (now V2)
Micky774 Nov 3, 2025
2586b18
Removed alterations
Micky774 Nov 3, 2025
aa80f81
Removed lazy tensor wrapper
Micky774 Nov 3, 2025
9a91b9e
Streamlined cmakelist, other PR review feedback addressed
Micky774 Nov 4, 2025
023deb4
Removed `pad_between_seqs`
Micky774 Nov 4, 2025
6b8dbe5
Updated typing to be more explicit
Micky774 Nov 4, 2025
68303d0
Minor streamlining and formatting
Micky774 Nov 4, 2025
8181972
Initial implementation
Micky774 Nov 6, 2025
6788a16
Simplified window size func for current non-SWA support
Micky774 Nov 6, 2025
182101a
Removed accidental include
Micky774 Nov 6, 2025
19a9c0f
Merge branch 'zain/aotriton' into zain/aotriton-bwd
Micky774 Nov 6, 2025
fef6baa
Corrected bwd args
Micky774 Nov 6, 2025
3a4fab8
Updated causal window default
Micky774 Nov 10, 2025
917e3c3
Updated window values for causal
Micky774 Nov 10, 2025
ce32e3b
Merge branch 'zain/aotriton' into zain/aotriton-bwd
Micky774 Nov 10, 2025
36045c8
Corrected DQ_ACC buffer, added env var for GPU kernel building
Micky774 Nov 12, 2025
d6e46c1
Update AOTriton to 0.11.1b
Micky774 Nov 12, 2025
1349a48
Merge branch 'dev' into zain/aotriton
Micky774 Nov 24, 2025
8ed0009
Merge branch 'zain/aotriton' into zain/aotriton-bwd
Micky774 Nov 24, 2025
2bd9006
Added AOTriton commit SHA
Micky774 Nov 25, 2025
a9bef37
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Nov 25, 2025
0fdff86
Moved handling of env variable to makefile
Micky774 Nov 26, 2025
3f6e054
Simplified lazy tensor implementation
Micky774 Dec 1, 2025
2246da4
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Dec 10, 2025
2a17f7b
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Jan 29, 2026
1a267cd
Update AOTriton version
Micky774 Jan 30, 2026
51da203
Improved tests
Micky774 Feb 4, 2026
945c8b2
Fix dq_acc stride. AITER ASM expects BHS.
xinyazhang Feb 5, 2026
e478e1a
Merge branch 'dev' into zain/aotriton-bwd
Micky774 Feb 5, 2026
75 changes: 64 additions & 11 deletions tests/pytorch/attention/test_attention.py
@@ -723,6 +723,7 @@ def test_dpa_alibi_slopes(dtype, model_configs, model):
"layout_2_1": ModelConfig(
2, 2048, 24, 256, attn_mask_type="causal", attn_bias_type="post_scale_bias"
),
"layout_3_0": ModelConfig(1, 2048, 12, 64, attn_mask_type="causal"),
}


@@ -1281,17 +1282,60 @@ def test_transformer_layer(

# FusedAttention backend
if fused_attn_supported:
fused_attn_fwd, fused_attn_bwd = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)
if len(fused_attn_backends) == 1:
fused_attn_fwd, fused_attn_bwd = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)
elif len(fused_attn_backends) == 2:
os.environ["NVTE_FUSED_ATTN_CK"] = "0"
os.environ["NVTE_FUSED_ATTN_AOTRITON"] = "1"
fused_attn_fwd, fused_attn_bwd = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)
os.environ["NVTE_FUSED_ATTN_CK"] = "1"
os.environ["NVTE_FUSED_ATTN_AOTRITON"] = "0"
fused_attn_fwd_1, fused_attn_bwd_1 = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)

os.environ["NVTE_CK_USES_FWD_V3"] = "1"
os.environ["NVTE_CK_USES_BWD_V3"] = "1"
fused_attn_fwd_2, fused_attn_bwd_2 = _run_transformer_layer(
dtype,
config,
"FusedAttention",
ckpt_attn,
qkv_format,
workspace_opt,
fused_qkv_params,
RoPE,
is_training,
)


# FlashAttention backend
if flash_attn_supported:
@@ -1320,6 +1364,15 @@ def test_transformer_layer(
logging.info("[test_transformer_layer]: fused attn vs flash attn")
torch.testing.assert_close(fused_attn_fwd, flash_attn_fwd, **tols)
torch.testing.assert_close(fused_attn_bwd, flash_attn_bwd, **tols)
if IS_HIP_EXTENSION and fused_attn_supported and len(fused_attn_backends) == 2:
logging.info("[test_transformer_layer]: fused attn backend 0 vs 1")
torch.testing.assert_close(fused_attn_fwd, fused_attn_fwd_1, **tols)
for i, _ in enumerate(fused_attn_bwd):
torch.testing.assert_close(fused_attn_bwd[i], fused_attn_bwd_1[i], **tols)
logging.info("[test_transformer_layer]: fused attn backend 0 vs 2")
torch.testing.assert_close(fused_attn_fwd, fused_attn_fwd_2, **tols)
for i, _ in enumerate(fused_attn_bwd):
torch.testing.assert_close(fused_attn_bwd[i], fused_attn_bwd_2[i], **tols)


@pytest.mark.skipif(get_cudnn_version() < (8, 9, 1), reason="cuDNN 8.9.1+ is required.")
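The new branch above runs the same TransformerLayer once per available ROCm fused-attention backend by toggling `NVTE_FUSED_ATTN_CK` / `NVTE_FUSED_ATTN_AOTRITON` (and additionally enabling the CK V3 kernels via `NVTE_CK_USES_FWD_V3` / `NVTE_CK_USES_BWD_V3`), then cross-checks the forward and backward results. Below is a minimal sketch of that toggle-and-compare pattern; the `attention_backend` context manager and the `run_layer` callable are illustrative stand-ins and not part of this PR, which sets the variables directly on `os.environ`.

```python
import os
from contextlib import contextmanager

import torch


@contextmanager
def attention_backend(**env):
    """Temporarily set backend-selection env vars, restoring prior values on exit."""
    saved = {name: os.environ.get(name) for name in env}
    os.environ.update(env)
    try:
        yield
    finally:
        for name, old in saved.items():
            if old is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = old


def compare_backends(run_layer, tols):
    """Run the same layer under each backend and cross-check fwd/bwd outputs.

    `run_layer` stands in for a zero-argument wrapper around
    _run_transformer_layer(...) with all other arguments fixed.
    """
    with attention_backend(NVTE_FUSED_ATTN_CK="0", NVTE_FUSED_ATTN_AOTRITON="1"):
        fwd_aotriton, bwd_aotriton = run_layer()
    with attention_backend(NVTE_FUSED_ATTN_CK="1", NVTE_FUSED_ATTN_AOTRITON="0"):
        fwd_ck, bwd_ck = run_layer()
    torch.testing.assert_close(fwd_aotriton, fwd_ck, **tols)
    for grad_a, grad_b in zip(bwd_aotriton, bwd_ck):
        torch.testing.assert_close(grad_a, grad_b, **tols)
```

Restoring the variables after each run keeps later parametrizations from inheriting a stale backend selection; the test itself flips them in place, which works because every branch sets both variables explicitly.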
4 changes: 2 additions & 2 deletions tests/pytorch/test_numerics.py
@@ -1212,7 +1212,6 @@ def _test_dpa_accuracy(block, bs, dtype, config):
query.retain_grad()
key.retain_grad()
value.retain_grad()

out = block(query, key, value, attention_mask=mask)
loss = out.sum()
loss.backward()
@@ -1256,7 +1255,8 @@ def test_dpa_accuracy(dtype, bs, model):
else:
assert_allclose(te_outputs[0], torch_outputs[0], 5e-2)

for te_output, torch_output in zip(te_outputs[1:], torch_outputs[1:]):
for idx, outs in enumerate(zip(te_outputs[1:], torch_outputs[1:])):
te_output, torch_output = outs
assert_allclose(te_output, torch_output, atol=5e-2, rtol=1e-2)


1 change: 0 additions & 1 deletion transformer_engine/common/CMakeLists.txt
@@ -8,7 +8,6 @@ cmake_minimum_required(VERSION 3.21)

option(USE_ROCM "Use ROCm" ON)
option(USE_FUSED_ATTN_AOTRITON "Use aotriton backend" ON)
option(USE_FUSED_ATTN_AOTRITON_BUILD_GPU_KERNELS "Build AOTriton GPU kernels" OFF)
option(USE_FUSED_ATTN_CK "Use ck backend" ON)
set(USE_CUDA OFF)

2 changes: 1 addition & 1 deletion transformer_engine/common/aotriton/CMakeLists.txt
@@ -20,7 +20,7 @@ if(NOT DEFINED AOTRITON_PATH)
set(AOTRITON_NOIMAGE_MODE ON)
endif()

set(__AOTRITON_VER "0.11.1b")
set(__AOTRITON_VER "0.11.2b")
set(__AOTRITON_IMAGE_LIST
"amd-gfx942"
"amd-gfx950"
3 changes: 3 additions & 0 deletions transformer_engine/common/fused_attn_rocm/fused_attn.cpp
@@ -492,6 +492,7 @@ void nvte_fused_attn_bwd_qkvpacked(const NVTETensor QKV, const NVTETensor O, con
fused_attn_aotriton_bwd_qkvpacked(
b, h, max_seqlen, d,
attn_scale, dropout,
window_size_left, window_size_right,
qkv_layout, bias_type, attn_mask_type,
input_QKV, input_O, input_dO, output_S,
output_dQKV,
@@ -678,6 +679,7 @@ void nvte_fused_attn_bwd_kvpacked(
fused_attn_aotriton_bwd_kvpacked(
b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d,
attn_scale, dropout,
window_size_left, window_size_right,
qkv_layout, bias_type, attn_mask_type,
input_Q, input_KV, input_O, input_dO,
output_S,
@@ -858,6 +860,7 @@ void nvte_fused_attn_bwd(const NVTETensor Q, const NVTETensor K, const NVTETenso
fused_attn_aotriton_bwd(
b, h_q, h_kv, max_seqlen_q, max_seqlen_kv, d_qk,
attn_scale, dropout,
window_size_left, window_size_right,
qkv_layout, bias_type, attn_mask_type,
input_Q, input_K, input_V, input_O, input_dO,
output_S,
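The three backward entry points above now thread `window_size_left` / `window_size_right` into the AOTriton backward path, using the usual sliding-window convention (a negative value means unbounded on that side; per the "Updated window values for causal" commits, causal attention is commonly expressed as the window `(-1, 0)`). The sketch below only illustrates how that `(left, right)` pair maps to a mask; it is not the kernel logic added in this PR, and it assumes aligned query/key positions of equal length.

```python
import torch


def window_mask(seqlen, window_left, window_right):
    """Boolean keep-mask for the (left, right) sliding-window convention.

    A negative value means unbounded on that side; causal attention is
    commonly written as (-1, 0): unlimited history, no future positions.
    Assumes self-attention with aligned query/key positions.
    """
    q = torch.arange(seqlen).unsqueeze(1)   # query positions (column vector)
    k = torch.arange(seqlen).unsqueeze(0)   # key positions (row vector)
    rel = k - q                             # how far ahead (+) or behind (-) each key is
    keep = torch.ones(seqlen, seqlen, dtype=torch.bool)
    if window_left >= 0:
        keep &= rel >= -window_left
    if window_right >= 0:
        keep &= rel <= window_right
    return keep


# Causal masking expressed as a window: (-1, 0) keeps only keys at or before
# each query position, i.e. a lower-triangular mask.
assert torch.equal(window_mask(4, -1, 0), torch.ones(4, 4, dtype=torch.bool).tril())
```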