From 4b6b9cb8f9e5a9d6931ba38ebba1a57dd446c901 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Wed, 11 Feb 2026 22:14:55 -0800
Subject: [PATCH 01/10] support qwen3.5 MOE PTQ

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/layer_utils.py          |   7 +-
 modelopt/torch/export/unified_export_hf.py    |  48 +++++--
 .../torch/quantization/plugins/huggingface.py | 120 ++++++++++++++++++
 modelopt/torch/utils/dataset_utils.py         |   4 +-
 4 files changed, 168 insertions(+), 11 deletions(-)

diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
index 9c68899d9..f9c95ff67 100755
--- a/modelopt/torch/export/layer_utils.py
+++ b/modelopt/torch/export/layer_utils.py
@@ -339,6 +339,7 @@ def is_moe(module: nn.Module) -> bool:
             "Qwen2MoeSparseMoeBlock".lower(),
             "Qwen3MoeSparseMoeBlock".lower(),
             "Qwen3NextSparseMoeBlock".lower(),
+            "Qwen3_5MoeSparseMoeBlock".lower(),
         ]
     )
 
@@ -1006,6 +1007,7 @@ def module_match_name_list(module, name_list):
             "Qwen2MoeSparseMoeBlock",
             "Qwen3MoeSparseMoeBlock",
             "Qwen3NextSparseMoeBlock",
+            "Qwen3_5MoeSparseMoeBlock",
             "DeepseekMoE",
         ],
     ):
@@ -1141,7 +1143,10 @@ def set_expert_quantizer_amax(
     # Apply target amax to quantizers that need it
     for module, attr_name, quantizer in all_quantizers:
         # Check if quantizer needs amax (use property for consistency)
-        needs_amax = getattr(quantizer, "amax", None) is None
+        # Also treat zero amax as needing recalibration — a zero amax is never valid
+        # and indicates the quantizer wasn't activated during calibration
+        amax = getattr(quantizer, "amax", None)
+        needs_amax = amax is None or (isinstance(amax, torch.Tensor) and torch.all(amax == 0))
 
         # Skip dynamic quantizers for input quantizers
         if "input_quantizer" in attr_name and getattr(quantizer, "_dynamic", False):
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 36a01bd73..3708cf137 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -589,7 +589,7 @@ def _process_quantized_modules(
     """
     fsdp_module_to_reshard = None
 
-    for _, sub_module in model.named_modules():
+    for name, sub_module in model.named_modules():
         # Optimization to perform resharding only once per decoder layer to avoid extra communication overhead
         if isinstance(sub_module, FSDPModule):
             # Every time we encounter a new FSDPModule, the previous decoder layer is fully processed.
@@ -610,8 +610,14 @@ def _process_quantized_modules(
             sub_module.unpack_weight()
         if get_quantization_format(sub_module) != QUANTIZATION_NONE:
             if is_quantlinear(sub_module):
-                with fsdp2_aware_weight_update(model, sub_module, reshard=False):
-                    _export_quantized_weight(sub_module, dtype)
+                try:
+                    with fsdp2_aware_weight_update(model, sub_module, reshard=False):
+                        _export_quantized_weight(sub_module, dtype)
+                except AssertionError as e:
+                    raise AssertionError(
+                        f"Failed to export module '{name}' "
+                        f"(type={type(sub_module).__name__}): {e}"
+                    ) from e
             elif (
                 "Llama4TextExperts" in type(sub_module).__name__
                 or "GptOssExperts" in type(sub_module).__name__
@@ -687,6 +693,16 @@ def _export_transformers_checkpoint(
                                     modules=[linear_module],
                                     quantizer_attrs=["input_quantizer"],
                                 )
+                elif "Qwen3_5MoeExperts" in type(sub_module.experts).__name__:
+                    # Handle Qwen3.5 MoE experts which use gate_proj/up_proj/down_proj ModuleLists
+                    for expert_linear_name in ["gate_proj", "up_proj", "down_proj"]:
+                        if hasattr(sub_module.experts, expert_linear_name):
+                            linear_modulelist = getattr(sub_module.experts, expert_linear_name)
+                            if hasattr(linear_modulelist, "__iter__"):
+                                set_expert_quantizer_amax(
+                                    modules=list(linear_modulelist),
+                                    quantizer_attrs=["input_quantizer"],
+                                )
                 elif isinstance(sub_module.experts, collections.abc.Iterable):
                     # For other MoE models (like Mixtral) with iterable experts
                     try:
@@ -1047,11 +1063,27 @@ def export_hf_checkpoint(
             model.hf_quantizer = None
 
         # Save model
-        model.save_pretrained(
-            export_dir,
-            state_dict={**post_state_dict, **(extra_state_dict or {})},
-            save_modelopt_state=save_modelopt_state,
-        )
+        # Temporarily disable revert_weight_conversion if available — it doesn't handle
+        # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
+        _patched_revert = False
+        try:
+            import transformers.core_model_loading as _cml
+
+            _original_revert = _cml.revert_weight_conversion
+            _cml.revert_weight_conversion = lambda model, state_dict: state_dict
+            _patched_revert = True
+        except (ImportError, AttributeError):
+            pass
+
+        try:
+            model.save_pretrained(
+                export_dir,
+                state_dict={**post_state_dict, **(extra_state_dict or {})},
+                save_modelopt_state=save_modelopt_state,
+            )
+        finally:
+            if _patched_revert:
+                _cml.revert_weight_conversion = _original_revert
 
         original_config = f"{export_dir}/config.json"
         config_data = {}
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index d58c388a1..a7cb46644 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -734,6 +734,86 @@ def forward(
         return next_states
 
 
+class _QuantQwen3_5MoeExperts(QuantModule):
+    def _setup(self):
+        """Modify the Qwen3_5MoeExperts by using nn.Linear layers."""
+        from accelerate import init_empty_weights
+
+        dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device
+
+        def _copy_weight(module, weight):
+            module.to_empty(device=device)
+            with torch.no_grad():
+                module.weight.data = weight.detach().data.to(dtype=dtype, device=device)
+
+        expert_dim = self.intermediate_dim
+
+        with init_empty_weights():
+            gate_proj = nn.ModuleList(
+                [
+                    nn.Linear(self.hidden_dim, expert_dim, bias=False)
+                    for _ in range(self.num_experts)
+                ]
+            )
+            up_proj = nn.ModuleList(
+                [
+                    nn.Linear(self.hidden_dim, expert_dim, bias=False)
+                    for _ in range(self.num_experts)
+                ]
+            )
+            down_proj = nn.ModuleList(
+                [
+                    nn.Linear(expert_dim, self.hidden_dim, bias=False)
+                    for _ in range(self.num_experts)
+                ]
+            )
+
+        for idx in range(self.num_experts):
+            # gate_up_proj shape: (num_experts, 2*intermediate_dim, hidden_dim)
+            # Already in (out_features, in_features) format, no transpose needed
+            _copy_weight(gate_proj[idx], self.gate_up_proj[idx, :expert_dim, :])
+            _copy_weight(up_proj[idx], self.gate_up_proj[idx, expert_dim:, :])
+            # down_proj shape: (num_experts, hidden_dim, intermediate_dim)
+            # Already in (out_features, in_features) format
+            _copy_weight(down_proj[idx], self.down_proj[idx])
+
+        delattr(self, "gate_up_proj")
+        delattr(self, "down_proj")
+        self.gate_proj = gate_proj
+        self.up_proj = up_proj
+        self.down_proj = down_proj
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        top_k_index: torch.Tensor,
+        top_k_weights: torch.Tensor,
+    ) -> torch.Tensor:
+        final_hidden_states = torch.zeros_like(hidden_states)
+        with torch.no_grad():
+            expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts)
+            expert_mask = expert_mask.permute(2, 1, 0)
+            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
+            expert_idx = expert_idx[0]
+            if expert_idx == self.num_experts:
+                continue
+            with torch.no_grad():
+                top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
+            current_state = hidden_states[token_idx]
+            gate = self.gate_proj[expert_idx](current_state)
+            up = self.up_proj[expert_idx](current_state)
+            current_hidden_states = self.act_fn(gate) * up
+            current_hidden_states = self.down_proj[expert_idx](current_hidden_states)
+            current_hidden_states = (
+                current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
+            )
+            final_hidden_states.index_add_(
+                0, token_idx, current_hidden_states.to(final_hidden_states.dtype)
+            )
+        return final_hidden_states
+
+
 class _QuantDbrxFFN(_QuantSparseMoe):
     @property
     def num_experts(self):
@@ -882,6 +962,46 @@ def unpack_weight(self):
     pass
 
 
+class _QuantQwen3_5MoeSparseMoeBlock(_QuantSparseMoe):
+    """Qwen3.5 MoE stores top_k/num_experts in the router (self.gate), not as direct attributes.
+
+    We override forward instead of just bridging attributes because the router (self.gate)
+    uses its own top_k internally for routing decisions. We must modify self.gate.top_k
+    directly so all experts see calibration data.
+    """
+
+    def _setup(self):
+        self.num_experts = self.experts.num_experts
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if any(getattr(m, "_if_calib", False) for m in self.experts.modules()):
+            # Force all tokens to all experts during calibration
+            original_top_k = self.gate.top_k
+            self.gate.top_k = self.num_experts
+            super(_QuantSparseMoe, self).forward(hidden_states)
+            self.gate.top_k = original_top_k
+        return super(_QuantSparseMoe, self).forward(hidden_states)
+
+
+try:
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
+        Qwen3_5MoeExperts,
+        Qwen3_5MoeSparseMoeBlock,
+    )
+
+    if Qwen3_5MoeSparseMoeBlock not in QuantModuleRegistry:
+        QuantModuleRegistry.register({Qwen3_5MoeSparseMoeBlock: "hf.Qwen3_5MoeSparseMoeBlock"})(
+            _QuantQwen3_5MoeSparseMoeBlock
+        )
+
+    if Qwen3_5MoeExperts not in QuantModuleRegistry:
+        QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})(
+            _QuantQwen3_5MoeExperts
+        )
+except ImportError:
+    pass
+
+
 class _QuantGptOssExperts(_QuantFunctionalMixin):
     """Quantized wrapper for `transformers.GptOssExperts`.
 
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index 0da8ad726..e71a01cd1 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -298,7 +298,7 @@ def get_dataset_dataloader(
         An instance of dataloader.
     """
     assert tokenizer is not None, "Please provide a tokenizer."
-    # batch_encode_plus will modify the tokenizer in place, so we need to clone it.
+    # Tokenizer encoding may modify the tokenizer in place, so we need to clone it.
     tokenizer = copy.deepcopy(tokenizer)
 
     if tokenizer.padding_side != "left":
@@ -323,7 +323,7 @@ def get_dataset_dataloader(
         )
         all_samples.extend(samples)
 
-    batch_encoded = tokenizer.batch_encode_plus(
+    batch_encoded = tokenizer(
         all_samples,
         return_tensors="pt",
         padding=True,

From 2c57e6297bee66b27ba39a14d4889ceddce48b8c Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Wed, 11 Feb 2026 22:27:43 -0800
Subject: [PATCH 02/10] also patch revert_weight_conversion in modeling_utils

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/layer_utils.py       |  2 +-
 modelopt/torch/export/unified_export_hf.py | 28 ++++++++++++++--------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
index f9c95ff67..820f94f39 100755
--- a/modelopt/torch/export/layer_utils.py
+++ b/modelopt/torch/export/layer_utils.py
@@ -1752,7 +1752,7 @@ def _split_fused_qkv_weight_and_scaling(
 
     qkv_in = weight.shape[-1] if weight_dim > 1 else 1
 
-    num_kv_heads = num_kv_heads if num_kv_heads else num_heads
+    num_kv_heads = num_kv_heads or num_heads
     assert num_heads % num_kv_heads == 0, (
         f"num_heads({num_heads}) must be divisible by num_kv_heads({num_kv_heads}))."
     )
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 3708cf137..4c95776b8 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -1065,15 +1065,23 @@ def export_hf_checkpoint(
         # Save model
         # Temporarily disable revert_weight_conversion if available — it doesn't handle
         # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
-        _patched_revert = False
-        try:
-            import transformers.core_model_loading as _cml
+        # We must patch both the source module and the importing module since
+        # modeling_utils does `from core_model_loading import revert_weight_conversion`.
+        _patches = []
+        _noop = lambda model, state_dict: state_dict
+        for _mod_path in [
+            "transformers.core_model_loading",
+            "transformers.modeling_utils",
+        ]:
+            try:
+                import importlib
 
-            _original_revert = _cml.revert_weight_conversion
-            _cml.revert_weight_conversion = lambda model, state_dict: state_dict
-            _patched_revert = True
-        except (ImportError, AttributeError):
-            pass
+                _mod = importlib.import_module(_mod_path)
+                if hasattr(_mod, "revert_weight_conversion"):
+                    _patches.append((_mod, getattr(_mod, "revert_weight_conversion")))
+                    setattr(_mod, "revert_weight_conversion", _noop)
+            except (ImportError, AttributeError):
+                pass
 
         try:
             model.save_pretrained(
@@ -1082,8 +1090,8 @@ def export_hf_checkpoint(
                 save_modelopt_state=save_modelopt_state,
             )
         finally:
-            if _patched_revert:
-                _cml.revert_weight_conversion = _original_revert
+            for _mod, _original in _patches:
+                _mod.revert_weight_conversion = _original
 
         original_config = f"{export_dir}/config.json"
         config_data = {}

From ad271ef4e3032cfadb0cfa530239c1094f0ce0c1 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Fri, 13 Feb 2026 16:58:27 -0800
Subject: [PATCH 03/10] swap order to export original tokenizer files

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 examples/llm_ptq/hf_ptq.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 3e9cb3706..1b00a4504 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -650,9 +650,6 @@ def export_quantized(
                 extra_state_dict=mtp_state_dict,
             )
 
-        # Copy custom model files (Python files and JSON configs) if trust_remote_code is used
-        copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)
-
         # Restore default padding and export the tokenizer as well.
         if tokenizer is not None:
             tokenizer.padding_side = default_padding_side
@@ -660,6 +657,12 @@ def export_quantized(
                 tokenizer.pad_token = default_pad_token
             tokenizer.save_pretrained(export_path)
 
+        # Copy custom model files (Python files and JSON configs) if trust_remote_code is used.
+        # This must run AFTER tokenizer.save_pretrained() so original tokenizer files
+        # from the source checkpoint take precedence over regenerated ones (which may
+        # differ in format due to newer transformers versions).
+        copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code)
+
         end_time = time.time()
         print(
             f"Quantized model exported to: {export_path}. Total time used {end_time - start_time}s"

From ef7149a7338d179ae1671b2520354f7e6b24df94 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Fri, 13 Feb 2026 16:59:07 -0800
Subject: [PATCH 04/10] collapse export error message f-string onto one line

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/unified_export_hf.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 4c95776b8..5ea0808ee 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -615,8 +615,7 @@ def _process_quantized_modules(
                         _export_quantized_weight(sub_module, dtype)
                 except AssertionError as e:
                     raise AssertionError(
-                        f"Failed to export module '{name}' "
-                        f"(type={type(sub_module).__name__}): {e}"
+                        f"Failed to export module '{name}' (type={type(sub_module).__name__}): {e}"
                     ) from e
             elif (
                 "Llama4TextExperts" in type(sub_module).__name__

From aa18c20aa5bc14e8f11d0a1cb63a5f0dd4f01724 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Mon, 16 Feb 2026 17:13:34 -0800
Subject: [PATCH 05/10] adopt *experts.{id}.* naming pattern

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/unified_export_hf.py    | 10 ---
 .../torch/quantization/plugins/huggingface.py | 69 ++++++++++++-------
 2 files changed, 45 insertions(+), 34 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 5ea0808ee..b7ecf0706 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -692,16 +692,6 @@ def _export_transformers_checkpoint(
                                     modules=[linear_module],
                                     quantizer_attrs=["input_quantizer"],
                                 )
-                elif "Qwen3_5MoeExperts" in type(sub_module.experts).__name__:
-                    # Handle Qwen3.5 MoE experts which use gate_proj/up_proj/down_proj ModuleLists
-                    for expert_linear_name in ["gate_proj", "up_proj", "down_proj"]:
-                        if hasattr(sub_module.experts, expert_linear_name):
-                            linear_modulelist = getattr(sub_module.experts, expert_linear_name)
-                            if hasattr(linear_modulelist, "__iter__"):
-                                set_expert_quantizer_amax(
-                                    modules=list(linear_modulelist),
-                                    quantizer_attrs=["input_quantizer"],
-                                )
                 elif isinstance(sub_module.experts, collections.abc.Iterable):
                     # For other MoE models (like Mixtral) with iterable experts
                     try:
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index a7cb46644..068665dae 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -734,9 +734,27 @@ def forward(
         return next_states
 
 
+class _Qwen3_5MoeExpertModule(nn.Module):
+    """Container for a single Qwen3.5 MoE expert's linear layers.
+
+    Produces the naming pattern: experts.{id}.gate_proj.weight
+    (consistent with standard Qwen3 MoE per-expert module structure).
+    """
+
+    def __init__(self, hidden_dim: int, expert_dim: int):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_dim, expert_dim, bias=False)
+        self.up_proj = nn.Linear(hidden_dim, expert_dim, bias=False)
+        self.down_proj = nn.Linear(expert_dim, hidden_dim, bias=False)
+
+
 class _QuantQwen3_5MoeExperts(QuantModule):
     def _setup(self):
-        """Modify the Qwen3_5MoeExperts by using nn.Linear layers."""
+        """Modify the Qwen3_5MoeExperts by using per-expert nn.Module containers.
+
+        This produces the naming pattern: experts.{id}.gate_proj.weight
+        (consistent with standard Qwen3 MoE).
+        """
         from accelerate import init_empty_weights
 
         dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device
@@ -749,21 +767,9 @@ def _copy_weight(module, weight):
         expert_dim = self.intermediate_dim
 
         with init_empty_weights():
-            gate_proj = nn.ModuleList(
-                [
-                    nn.Linear(self.hidden_dim, expert_dim, bias=False)
-                    for _ in range(self.num_experts)
-                ]
-            )
-            up_proj = nn.ModuleList(
-                [
-                    nn.Linear(self.hidden_dim, expert_dim, bias=False)
-                    for _ in range(self.num_experts)
-                ]
-            )
-            down_proj = nn.ModuleList(
+            expert_modules = nn.ModuleList(
                 [
-                    nn.Linear(expert_dim, self.hidden_dim, bias=False)
+                    _Qwen3_5MoeExpertModule(self.hidden_dim, expert_dim)
                     for _ in range(self.num_experts)
                 ]
             )
@@ -771,17 +777,31 @@ def _copy_weight(module, weight):
         for idx in range(self.num_experts):
             # gate_up_proj shape: (num_experts, 2*intermediate_dim, hidden_dim)
             # Already in (out_features, in_features) format, no transpose needed
-            _copy_weight(gate_proj[idx], self.gate_up_proj[idx, :expert_dim, :])
-            _copy_weight(up_proj[idx], self.gate_up_proj[idx, expert_dim:, :])
+            _copy_weight(expert_modules[idx].gate_proj, self.gate_up_proj[idx, :expert_dim, :])
+            _copy_weight(expert_modules[idx].up_proj, self.gate_up_proj[idx, expert_dim:, :])
             # down_proj shape: (num_experts, hidden_dim, intermediate_dim)
             # Already in (out_features, in_features) format
-            _copy_weight(down_proj[idx], self.down_proj[idx])
+            _copy_weight(expert_modules[idx].down_proj, self.down_proj[idx])
 
         delattr(self, "gate_up_proj")
         delattr(self, "down_proj")
-        self.gate_proj = gate_proj
-        self.up_proj = up_proj
-        self.down_proj = down_proj
+        # Register expert modules directly as numbered children (like nn.ModuleList)
+        # so the naming pattern is: experts.{id}.gate_proj.weight (no extra nesting)
+        for idx in range(self.num_experts):
+            self.add_module(str(idx), expert_modules[idx])
+
+    def __len__(self):
+        """Support len() so the module is iterable like standard MoE experts."""
+        return self.num_experts
+
+    def __iter__(self):
+        """Support iteration over expert modules."""
+        for idx in range(self.num_experts):
+            yield getattr(self, str(idx))
+
+    def __getitem__(self, idx):
+        """Support indexing to get individual expert modules."""
+        return getattr(self, str(int(idx)))
 
     def forward(
         self,
@@ -801,10 +821,11 @@ def forward(
             with torch.no_grad():
                 top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
             current_state = hidden_states[token_idx]
-            gate = self.gate_proj[expert_idx](current_state)
-            up = self.up_proj[expert_idx](current_state)
+            expert = self[expert_idx]
+            gate = expert.gate_proj(current_state)
+            up = expert.up_proj(current_state)
             current_hidden_states = self.act_fn(gate) * up
-            current_hidden_states = self.down_proj[expert_idx](current_hidden_states)
+            current_hidden_states = expert.down_proj(current_hidden_states)
             current_hidden_states = (
                 current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
             )

From 6e8514189133cc48efcc960f71834cbb37e123dc Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Mon, 16 Feb 2026 17:19:04 -0800
Subject: [PATCH 06/10] extract revert_weight_conversion patch helpers; rename Qwen3_5 quant classes

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/unified_export_hf.py    | 49 ++++++++++++-------
 .../torch/quantization/plugins/huggingface.py | 12 ++---
 2 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index b7ecf0706..7878d8ce7 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -993,6 +993,36 @@ def _export_diffusers_checkpoint(
     print(f"Export complete. Saved to: {export_dir}")
 
 
+def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict:
+    """No-op replacement for transformers' revert_weight_conversion."""
+    return state_dict
+
+
+def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]:
+    """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors."""
+    import importlib
+
+    patches: list[tuple[Any, Any]] = []
+    for mod_path in [
+        "transformers.core_model_loading",
+        "transformers.modeling_utils",
+    ]:
+        try:
+            mod = importlib.import_module(mod_path)
+            if hasattr(mod, "revert_weight_conversion"):
+                patches.append((mod, getattr(mod, "revert_weight_conversion")))
+                setattr(mod, "revert_weight_conversion", _revert_weight_conversion_noop)
+        except (ImportError, AttributeError):
+            pass
+    return patches
+
+
+def _unpatch_revert_weight_conversion(patches: list[tuple[Any, Any]]) -> None:
+    """Restore the original revert_weight_conversion functions."""
+    for mod, original in patches:
+        mod.revert_weight_conversion = original
+
+
 def export_hf_checkpoint(
     model: Any,
     dtype: torch.dtype | None = None,
@@ -1056,21 +1086,7 @@ def export_hf_checkpoint(
         # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
         # We must patch both the source module and the importing module since
         # modeling_utils does `from core_model_loading import revert_weight_conversion`.
-        _patches = []
-        _noop = lambda model, state_dict: state_dict
-        for _mod_path in [
-            "transformers.core_model_loading",
-            "transformers.modeling_utils",
-        ]:
-            try:
-                import importlib
-
-                _mod = importlib.import_module(_mod_path)
-                if hasattr(_mod, "revert_weight_conversion"):
-                    _patches.append((_mod, getattr(_mod, "revert_weight_conversion")))
-                    setattr(_mod, "revert_weight_conversion", _noop)
-            except (ImportError, AttributeError):
-                pass
+        _patches = _patch_revert_weight_conversion()
 
         try:
             model.save_pretrained(
@@ -1079,8 +1095,7 @@ def export_hf_checkpoint(
                 save_modelopt_state=save_modelopt_state,
             )
         finally:
-            for _mod, _original in _patches:
-                _mod.revert_weight_conversion = _original
+            _unpatch_revert_weight_conversion(_patches)
 
         original_config = f"{export_dir}/config.json"
         config_data = {}
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index 068665dae..4bc07996b 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -734,7 +734,7 @@ def forward(
         return next_states
 
 
-class _Qwen3_5MoeExpertModule(nn.Module):
+class _Qwen35MoeExpertModule(nn.Module):
     """Container for a single Qwen3.5 MoE expert's linear layers.
 
     Produces the naming pattern: experts.{id}.gate_proj.weight
@@ -748,7 +748,7 @@ def __init__(self, hidden_dim: int, expert_dim: int):
         self.down_proj = nn.Linear(expert_dim, hidden_dim, bias=False)
 
 
-class _QuantQwen3_5MoeExperts(QuantModule):
+class _QuantQwen35MoeExperts(QuantModule):
     def _setup(self):
         """Modify the Qwen3_5MoeExperts by using per-expert nn.Module containers.
 
@@ -769,7 +769,7 @@ def _copy_weight(module, weight):
         with init_empty_weights():
             expert_modules = nn.ModuleList(
                 [
-                    _Qwen3_5MoeExpertModule(self.hidden_dim, expert_dim)
+                    _Qwen35MoeExpertModule(self.hidden_dim, expert_dim)
                     for _ in range(self.num_experts)
                 ]
             )
@@ -983,7 +983,7 @@ def unpack_weight(self):
     pass
 
 
-class _QuantQwen3_5MoeSparseMoeBlock(_QuantSparseMoe):
+class _QuantQwen35MoeSparseMoeBlock(_QuantSparseMoe):
     """Qwen3.5 MoE stores top_k/num_experts in the router (self.gate), not as direct attributes.
 
     We override forward instead of just bridging attributes because the router (self.gate)
@@ -1012,12 +1012,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
     if Qwen3_5MoeSparseMoeBlock not in QuantModuleRegistry:
         QuantModuleRegistry.register({Qwen3_5MoeSparseMoeBlock: "hf.Qwen3_5MoeSparseMoeBlock"})(
-            _QuantQwen3_5MoeSparseMoeBlock
+            _QuantQwen35MoeSparseMoeBlock
         )
 
     if Qwen3_5MoeExperts not in QuantModuleRegistry:
         QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})(
-            _QuantQwen3_5MoeExperts
+            _QuantQwen35MoeExperts
         )
 except ImportError:
     pass

From 811ef61cae8c1dba871ea723bdc40050adf074f7 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Mon, 16 Feb 2026 17:20:21 -0800
Subject: [PATCH 07/10] split per-module revert_weight_conversion patching into a helper

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/unified_export_hf.py | 27 ++++++++++++++--------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 7878d8ce7..e965d14ed 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -998,22 +998,31 @@ def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict:
     return state_dict
 
 
-def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]:
-    """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors."""
+def _try_patch_module(mod_path: str) -> tuple[Any, Any] | None:
+    """Try to patch revert_weight_conversion in a single module."""
     import importlib
 
+    try:
+        mod = importlib.import_module(mod_path)
+        if hasattr(mod, "revert_weight_conversion"):
+            original = getattr(mod, "revert_weight_conversion")
+            setattr(mod, "revert_weight_conversion", _revert_weight_conversion_noop)
+            return (mod, original)
+    except (ImportError, AttributeError):
+        pass
+    return None
+
+
+def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]:
+    """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors."""
     patches: list[tuple[Any, Any]] = []
     for mod_path in [
         "transformers.core_model_loading",
         "transformers.modeling_utils",
     ]:
-        try:
-            mod = importlib.import_module(mod_path)
-            if hasattr(mod, "revert_weight_conversion"):
-                patches.append((mod, getattr(mod, "revert_weight_conversion")))
-                setattr(mod, "revert_weight_conversion", _revert_weight_conversion_noop)
-        except (ImportError, AttributeError):
-            pass
+        result = _try_patch_module(mod_path)
+        if result is not None:
+            patches.append(result)
     return patches
 
 

From bd029e5703c3af842fd597651d44b852dfb84938 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Wed, 25 Feb 2026 22:29:06 -0800
Subject: [PATCH 08/10] address reviews: auto-detect MoE blocks, drop explicit Qwen3.5 SparseMoeBlock wrapper

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/layer_utils.py          | 21 ++++--------
 .../torch/quantization/plugins/huggingface.py | 34 +++----------------
 2 files changed, 10 insertions(+), 45 deletions(-)

diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
index 820f94f39..acb5b9ac2 100755
--- a/modelopt/torch/export/layer_utils.py
+++ b/modelopt/torch/export/layer_utils.py
@@ -327,21 +327,12 @@ def is_mlp(module: nn.Module) -> bool:
 
 def is_moe(module: nn.Module) -> bool:
     """Returns whether the module is an MOE layer."""
-    return any(
-        key in type(module).__name__.lower()
-        for key in [
-            "MixtralSparseMoeBlock".lower(),
-            "ArcticMoE".lower(),
-            "DbrxFFN".lower(),
-            "MoELayer".lower(),
-            "PhimoeSparseMoeBlock".lower(),
-            "DeepseekMoE".lower(),
-            "Qwen2MoeSparseMoeBlock".lower(),
-            "Qwen3MoeSparseMoeBlock".lower(),
-            "Qwen3NextSparseMoeBlock".lower(),
-            "Qwen3_5MoeSparseMoeBlock".lower(),
-        ]
-    )
+    name = type(module).__name__.lower()
+    # Auto-detect common MoE patterns
+    if name.endswith("sparsemoeblock") or "moelayer" in name:
+        return True
+    # Explicit matches for non-standard naming
+    return any(key in name for key in ["arcticmoe", "deepseekmoe", "dbrxffn"])
 
 
 def is_quantlinear(module: nn.Module) -> bool:
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index 4bc07996b..321b5a1ac 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -983,38 +983,12 @@ def unpack_weight(self):
     pass
 
 
-class _QuantQwen35MoeSparseMoeBlock(_QuantSparseMoe):
-    """Qwen3.5 MoE stores top_k/num_experts in the router (self.gate), not as direct attributes.
-
-    We override forward instead of just bridging attributes because the router (self.gate)
-    uses its own top_k internally for routing decisions. We must modify self.gate.top_k
-    directly so all experts see calibration data.
-    """
-
-    def _setup(self):
-        self.num_experts = self.experts.num_experts
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        if any(getattr(m, "_if_calib", False) for m in self.experts.modules()):
-            # Force all tokens to all experts during calibration
-            original_top_k = self.gate.top_k
-            self.gate.top_k = self.num_experts
-            super(_QuantSparseMoe, self).forward(hidden_states)
-            self.gate.top_k = original_top_k
-        return super(_QuantSparseMoe, self).forward(hidden_states)
-
-
 try:
-    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (
-        Qwen3_5MoeExperts,
-        Qwen3_5MoeSparseMoeBlock,
-    )
-
-    if Qwen3_5MoeSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register({Qwen3_5MoeSparseMoeBlock: "hf.Qwen3_5MoeSparseMoeBlock"})(
-            _QuantQwen35MoeSparseMoeBlock
-        )
+    from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeExperts
 
+    # Qwen3_5MoeSparseMoeBlock registration is handled by register_sparse_moe_on_the_fly
+    # (auto-detected via gate.top_k + gate.num_experts + experts pattern).
+    # Only the fused expert weights need explicit registration.
     if Qwen3_5MoeExperts not in QuantModuleRegistry:
         QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})(
             _QuantQwen35MoeExperts

From 86d35a5a7b3e0e5c2bf3b923718b6799c840bdc8 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Thu, 26 Feb 2026 00:33:01 -0800
Subject: [PATCH 09/10] minor: add TODO explaining the revert_weight_conversion workaround

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 modelopt/torch/export/unified_export_hf.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index e965d14ed..4e9e3ba32 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -993,6 +993,11 @@ def _export_diffusers_checkpoint(
     print(f"Export complete. Saved to: {export_dir}")
 
 
+# TODO: Remove this workaround once HuggingFace fixes revert_weight_conversion to handle
+# scalar (0-d) tensors. The bug is in transformers' Chunk.convert() which calls
+# tensor.size(self.dim) on quantization scale buffers that are 0-d scalars, causing
+# IndexError. Confirmed still present in transformers 5.2.0.
+# See: transformers/core_model_loading.py, Chunk.convert()
 def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict:
     """No-op replacement for transformers' revert_weight_conversion."""
     return state_dict

From c7bb2916ac39d62ae9561eee3b46cf241b7fd476 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng <zhiyuc@nvidia.com>
Date: Thu, 26 Feb 2026 13:02:45 -0800
Subject: [PATCH 10/10] update doc

Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
---
 CHANGELOG.rst              | 1 +
 examples/llm_ptq/README.md | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 5d6b61c21..e08058013 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -16,6 +16,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
 - Add support for rotating the input before quantization for RHT.
 - Add support for advanced weight scale search for NVFP4 quantization and its export path.
+- Enable PTQ workflow for Qwen3.5 MoE models.
 
 0.42 (2026-02-xx)
 ^^^^^^^^^^^^^^^^^
diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md
index 187eed7f1..7a9a71f88 100755
--- a/examples/llm_ptq/README.md
+++ b/examples/llm_ptq/README.md
@@ -106,7 +106,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http
 | Llama-Nemotron Ultra | ✅ | ❌ | ❌ | ❌ | ❌ |
 | Gemma 3 | ✅<sup>2</sup> | - | ✅ | - | - |
 | QWen 2, 2.5 <sup>4</sup> | ✅ | ✅ | ✅ | ✅ | ✅ |
-| QWen3 MOE, Next <sup>6</sup> | ✅ | - | - | - | ✅ |
+| QWen3 MOE, 3.5 MOE, Next <sup>6</sup> | ✅ | - | - | - | ✅ |
 | QwQ | ✅ | - | - | - | ✅ |
 | DeepSeek V3, R1, V3.1, V3.2<sup>7</sup> | - | - | - | - | ✅ |
 | GLM-4.7<sup>8</sup> | ✅ | - | - | - | ✅ |
@@ -402,6 +402,7 @@ print(llm_fp8.generate(["What's the age of the earth? "]))
 | QWen3 | FP4 | ✅ | ✅ | - |
 | QWen3 MoE | FP8 | ✅ | ✅ | ✅ |
 | QWen3 MoE | FP4 | ✅ | - | - |
+| QWen3.5 MoE | FP4 | - | - | ✅ |
 | QWen2.5 | FP8 | ✅ | ✅ | ✅ |
 | QWen2.5 | FP4 | ✅ | ✅ | - |
 | QwQ-32B | FP8 | ✅ | ✅ | ✅ |