From 4b6b9cb8f9e5a9d6931ba38ebba1a57dd446c901 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 11 Feb 2026 22:14:55 -0800 Subject: [PATCH 01/10] support qwen3.5 MOE PTQ Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/layer_utils.py | 7 +- modelopt/torch/export/unified_export_hf.py | 48 +++++-- .../torch/quantization/plugins/huggingface.py | 120 ++++++++++++++++++ modelopt/torch/utils/dataset_utils.py | 4 +- 4 files changed, 168 insertions(+), 11 deletions(-) diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index 9c68899d9..f9c95ff67 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -339,6 +339,7 @@ def is_moe(module: nn.Module) -> bool: "Qwen2MoeSparseMoeBlock".lower(), "Qwen3MoeSparseMoeBlock".lower(), "Qwen3NextSparseMoeBlock".lower(), + "Qwen3_5MoeSparseMoeBlock".lower(), ] ) @@ -1006,6 +1007,7 @@ def module_match_name_list(module, name_list): "Qwen2MoeSparseMoeBlock", "Qwen3MoeSparseMoeBlock", "Qwen3NextSparseMoeBlock", + "Qwen3_5MoeSparseMoeBlock", "DeepseekMoE", ], ): @@ -1141,7 +1143,10 @@ def set_expert_quantizer_amax( # Apply target amax to quantizers that need it for module, attr_name, quantizer in all_quantizers: # Check if quantizer needs amax (use property for consistency) - needs_amax = getattr(quantizer, "amax", None) is None + # Also treat zero amax as needing recalibration — a zero amax is never valid + # and indicates the quantizer wasn't activated during calibration + amax = getattr(quantizer, "amax", None) + needs_amax = amax is None or (isinstance(amax, torch.Tensor) and torch.all(amax == 0)) # Skip dynamic quantizers for input quantizers if "input_quantizer" in attr_name and getattr(quantizer, "_dynamic", False): diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 36a01bd73..3708cf137 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -589,7 +589,7 @@ def _process_quantized_modules( """ fsdp_module_to_reshard = None - for _, sub_module in model.named_modules(): + for name, sub_module in model.named_modules(): # Optimization to perform resharding only once per decoder layer to avoid extra communication overhead if isinstance(sub_module, FSDPModule): # Every time we encounter a new FSDPModule, the previous decoder layer is fully processed. @@ -610,8 +610,14 @@ def _process_quantized_modules( sub_module.unpack_weight() if get_quantization_format(sub_module) != QUANTIZATION_NONE: if is_quantlinear(sub_module): - with fsdp2_aware_weight_update(model, sub_module, reshard=False): - _export_quantized_weight(sub_module, dtype) + try: + with fsdp2_aware_weight_update(model, sub_module, reshard=False): + _export_quantized_weight(sub_module, dtype) + except AssertionError as e: + raise AssertionError( + f"Failed to export module '{name}' " + f"(type={type(sub_module).__name__}): {e}" + ) from e elif ( "Llama4TextExperts" in type(sub_module).__name__ or "GptOssExperts" in type(sub_module).__name__ @@ -687,6 +693,16 @@ def _export_transformers_checkpoint( modules=[linear_module], quantizer_attrs=["input_quantizer"], ) + elif "Qwen3_5MoeExperts" in type(sub_module.experts).__name__: + # Handle Qwen3.5 MoE experts which use gate_proj/up_proj/down_proj ModuleLists + for expert_linear_name in ["gate_proj", "up_proj", "down_proj"]: + if hasattr(sub_module.experts, expert_linear_name): + linear_modulelist = getattr(sub_module.experts, expert_linear_name) + if hasattr(linear_modulelist, "__iter__"): + set_expert_quantizer_amax( + modules=list(linear_modulelist), + quantizer_attrs=["input_quantizer"], + ) elif isinstance(sub_module.experts, collections.abc.Iterable): # For other MoE models (like Mixtral) with iterable experts try: @@ -1047,11 +1063,27 @@ def export_hf_checkpoint( model.hf_quantizer = None # Save model - model.save_pretrained( - export_dir, - state_dict={**post_state_dict, **(extra_state_dict or {})}, - save_modelopt_state=save_modelopt_state, - ) + # Temporarily disable revert_weight_conversion if available — it doesn't handle + # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError). + _patched_revert = False + try: + import transformers.core_model_loading as _cml + + _original_revert = _cml.revert_weight_conversion + _cml.revert_weight_conversion = lambda model, state_dict: state_dict + _patched_revert = True + except (ImportError, AttributeError): + pass + + try: + model.save_pretrained( + export_dir, + state_dict={**post_state_dict, **(extra_state_dict or {})}, + save_modelopt_state=save_modelopt_state, + ) + finally: + if _patched_revert: + _cml.revert_weight_conversion = _original_revert original_config = f"{export_dir}/config.json" config_data = {} diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index d58c388a1..a7cb46644 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -734,6 +734,86 @@ def forward( return next_states +class _QuantQwen3_5MoeExperts(QuantModule): + def _setup(self): + """Modify the Qwen3_5MoeExperts by using nn.Linear layers.""" + from accelerate import init_empty_weights + + dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device + + def _copy_weight(module, weight): + module.to_empty(device=device) + with torch.no_grad(): + module.weight.data = weight.detach().data.to(dtype=dtype, device=device) + + expert_dim = self.intermediate_dim + + with init_empty_weights(): + gate_proj = nn.ModuleList( + [ + nn.Linear(self.hidden_dim, expert_dim, bias=False) + for _ in range(self.num_experts) + ] + ) + up_proj = nn.ModuleList( + [ + nn.Linear(self.hidden_dim, expert_dim, bias=False) + for _ in range(self.num_experts) + ] + ) + down_proj = nn.ModuleList( + [ + nn.Linear(expert_dim, self.hidden_dim, bias=False) + for _ in range(self.num_experts) + ] + ) + + for idx in range(self.num_experts): + # gate_up_proj shape: (num_experts, 2*intermediate_dim, hidden_dim) + # Already in (out_features, in_features) format, no transpose needed + _copy_weight(gate_proj[idx], self.gate_up_proj[idx, :expert_dim, :]) + _copy_weight(up_proj[idx], self.gate_up_proj[idx, expert_dim:, :]) + # down_proj shape: (num_experts, hidden_dim, intermediate_dim) + # Already in (out_features, in_features) format + _copy_weight(down_proj[idx], self.down_proj[idx]) + + delattr(self, "gate_up_proj") + delattr(self, "down_proj") + self.gate_proj = gate_proj + self.up_proj = up_proj + self.down_proj = down_proj + + def forward( + self, + hidden_states: torch.Tensor, + top_k_index: torch.Tensor, + top_k_weights: torch.Tensor, + ) -> torch.Tensor: + final_hidden_states = torch.zeros_like(hidden_states) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts) + expert_mask = expert_mask.permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit: + expert_idx = expert_idx[0] + if expert_idx == self.num_experts: + continue + with torch.no_grad(): + top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) + current_state = hidden_states[token_idx] + gate = self.gate_proj[expert_idx](current_state) + up = self.up_proj[expert_idx](current_state) + current_hidden_states = self.act_fn(gate) * up + current_hidden_states = self.down_proj[expert_idx](current_hidden_states) + current_hidden_states = ( + current_hidden_states * top_k_weights[token_idx, top_k_pos, None] + ) + final_hidden_states.index_add_( + 0, token_idx, current_hidden_states.to(final_hidden_states.dtype) + ) + return final_hidden_states + + class _QuantDbrxFFN(_QuantSparseMoe): @property def num_experts(self): @@ -882,6 +962,46 @@ def unpack_weight(self): pass +class _QuantQwen3_5MoeSparseMoeBlock(_QuantSparseMoe): + """Qwen3.5 MoE stores top_k/num_experts in the router (self.gate), not as direct attributes. + + We override forward instead of just bridging attributes because the router (self.gate) + uses its own top_k internally for routing decisions. We must modify self.gate.top_k + directly so all experts see calibration data. + """ + + def _setup(self): + self.num_experts = self.experts.num_experts + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + if any(getattr(m, "_if_calib", False) for m in self.experts.modules()): + # Force all tokens to all experts during calibration + original_top_k = self.gate.top_k + self.gate.top_k = self.num_experts + super(_QuantSparseMoe, self).forward(hidden_states) + self.gate.top_k = original_top_k + return super(_QuantSparseMoe, self).forward(hidden_states) + + +try: + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import ( + Qwen3_5MoeExperts, + Qwen3_5MoeSparseMoeBlock, + ) + + if Qwen3_5MoeSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register({Qwen3_5MoeSparseMoeBlock: "hf.Qwen3_5MoeSparseMoeBlock"})( + _QuantQwen3_5MoeSparseMoeBlock + ) + + if Qwen3_5MoeExperts not in QuantModuleRegistry: + QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})( + _QuantQwen3_5MoeExperts + ) +except ImportError: + pass + + class _QuantGptOssExperts(_QuantFunctionalMixin): """Quantized wrapper for `transformers.GptOssExperts`. diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 0da8ad726..e71a01cd1 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -298,7 +298,7 @@ def get_dataset_dataloader( An instance of dataloader. """ assert tokenizer is not None, "Please provide a tokenizer." - # batch_encode_plus will modify the tokenizer in place, so we need to clone it. + # Tokenizer encoding may modify the tokenizer in place, so we need to clone it. tokenizer = copy.deepcopy(tokenizer) if tokenizer.padding_side != "left": @@ -323,7 +323,7 @@ def get_dataset_dataloader( ) all_samples.extend(samples) - batch_encoded = tokenizer.batch_encode_plus( + batch_encoded = tokenizer( all_samples, return_tensors="pt", padding=True, From 2c57e6297bee66b27ba39a14d4889ceddce48b8c Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 11 Feb 2026 22:27:43 -0800 Subject: [PATCH 02/10] update Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/layer_utils.py | 2 +- modelopt/torch/export/unified_export_hf.py | 28 ++++++++++++++-------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index f9c95ff67..820f94f39 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -1752,7 +1752,7 @@ def _split_fused_qkv_weight_and_scaling( qkv_in = weight.shape[-1] if weight_dim > 1 else 1 - num_kv_heads = num_kv_heads if num_kv_heads else num_heads + num_kv_heads = num_kv_heads or num_heads assert num_heads % num_kv_heads == 0, ( f"num_heads({num_heads}) must be divisible by num_kv_heads({num_kv_heads}))." ) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 3708cf137..4c95776b8 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -1065,15 +1065,23 @@ def export_hf_checkpoint( # Save model # Temporarily disable revert_weight_conversion if available — it doesn't handle # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError). - _patched_revert = False - try: - import transformers.core_model_loading as _cml + # We must patch both the source module and the importing module since + # modeling_utils does `from core_model_loading import revert_weight_conversion`. + _patches = [] + _noop = lambda model, state_dict: state_dict + for _mod_path in [ + "transformers.core_model_loading", + "transformers.modeling_utils", + ]: + try: + import importlib - _original_revert = _cml.revert_weight_conversion - _cml.revert_weight_conversion = lambda model, state_dict: state_dict - _patched_revert = True - except (ImportError, AttributeError): - pass + _mod = importlib.import_module(_mod_path) + if hasattr(_mod, "revert_weight_conversion"): + _patches.append((_mod, getattr(_mod, "revert_weight_conversion"))) + setattr(_mod, "revert_weight_conversion", _noop) + except (ImportError, AttributeError): + pass try: model.save_pretrained( @@ -1082,8 +1090,8 @@ def export_hf_checkpoint( save_modelopt_state=save_modelopt_state, ) finally: - if _patched_revert: - _cml.revert_weight_conversion = _original_revert + for _mod, _original in _patches: + _mod.revert_weight_conversion = _original original_config = f"{export_dir}/config.json" config_data = {} From ad271ef4e3032cfadb0cfa530239c1094f0ce0c1 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 13 Feb 2026 16:58:27 -0800 Subject: [PATCH 03/10] swap order to export original tokenizer files Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 3e9cb3706..1b00a4504 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -650,9 +650,6 @@ def export_quantized( extra_state_dict=mtp_state_dict, ) - # Copy custom model files (Python files and JSON configs) if trust_remote_code is used - copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code) - # Restore default padding and export the tokenizer as well. if tokenizer is not None: tokenizer.padding_side = default_padding_side @@ -660,6 +657,12 @@ def export_quantized( tokenizer.pad_token = default_pad_token tokenizer.save_pretrained(export_path) + # Copy custom model files (Python files and JSON configs) if trust_remote_code is used. + # This must run AFTER tokenizer.save_pretrained() so original tokenizer files + # from the source checkpoint take precedence over regenerated ones (which may + # differ in format due to newer transformers versions). + copy_custom_model_files(args.pyt_ckpt_path, export_path, args.trust_remote_code) + end_time = time.time() print( f"Quantized model exported to: {export_path}. Total time used {end_time - start_time}s" From ef7149a7338d179ae1671b2520354f7e6b24df94 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Fri, 13 Feb 2026 16:59:07 -0800 Subject: [PATCH 04/10] swap order to export original tokenizer files Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 4c95776b8..5ea0808ee 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -615,8 +615,7 @@ def _process_quantized_modules( _export_quantized_weight(sub_module, dtype) except AssertionError as e: raise AssertionError( - f"Failed to export module '{name}' " - f"(type={type(sub_module).__name__}): {e}" + f"Failed to export module '{name}' (type={type(sub_module).__name__}): {e}" ) from e elif ( "Llama4TextExperts" in type(sub_module).__name__ From aa18c20aa5bc14e8f11d0a1cb63a5f0dd4f01724 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 16 Feb 2026 17:13:34 -0800 Subject: [PATCH 05/10] adopt *experts.{id}.* naming pattern Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 10 --- .../torch/quantization/plugins/huggingface.py | 69 ++++++++++++------- 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 5ea0808ee..b7ecf0706 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -692,16 +692,6 @@ def _export_transformers_checkpoint( modules=[linear_module], quantizer_attrs=["input_quantizer"], ) - elif "Qwen3_5MoeExperts" in type(sub_module.experts).__name__: - # Handle Qwen3.5 MoE experts which use gate_proj/up_proj/down_proj ModuleLists - for expert_linear_name in ["gate_proj", "up_proj", "down_proj"]: - if hasattr(sub_module.experts, expert_linear_name): - linear_modulelist = getattr(sub_module.experts, expert_linear_name) - if hasattr(linear_modulelist, "__iter__"): - set_expert_quantizer_amax( - modules=list(linear_modulelist), - quantizer_attrs=["input_quantizer"], - ) elif isinstance(sub_module.experts, collections.abc.Iterable): # For other MoE models (like Mixtral) with iterable experts try: diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index a7cb46644..068665dae 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -734,9 +734,27 @@ def forward( return next_states +class _Qwen3_5MoeExpertModule(nn.Module): + """Container for a single Qwen3.5 MoE expert's linear layers. + + Produces the naming pattern: experts.{id}.gate_proj.weight + (consistent with standard Qwen3 MoE per-expert module structure). + """ + + def __init__(self, hidden_dim: int, expert_dim: int): + super().__init__() + self.gate_proj = nn.Linear(hidden_dim, expert_dim, bias=False) + self.up_proj = nn.Linear(hidden_dim, expert_dim, bias=False) + self.down_proj = nn.Linear(expert_dim, hidden_dim, bias=False) + + class _QuantQwen3_5MoeExperts(QuantModule): def _setup(self): - """Modify the Qwen3_5MoeExperts by using nn.Linear layers.""" + """Modify the Qwen3_5MoeExperts by using per-expert nn.Module containers. + + This produces the naming pattern: experts.{id}.gate_proj.weight + (consistent with standard Qwen3 MoE). + """ from accelerate import init_empty_weights dtype, device = self.gate_up_proj.dtype, self.gate_up_proj.device @@ -749,21 +767,9 @@ def _copy_weight(module, weight): expert_dim = self.intermediate_dim with init_empty_weights(): - gate_proj = nn.ModuleList( - [ - nn.Linear(self.hidden_dim, expert_dim, bias=False) - for _ in range(self.num_experts) - ] - ) - up_proj = nn.ModuleList( - [ - nn.Linear(self.hidden_dim, expert_dim, bias=False) - for _ in range(self.num_experts) - ] - ) - down_proj = nn.ModuleList( + expert_modules = nn.ModuleList( [ - nn.Linear(expert_dim, self.hidden_dim, bias=False) + _Qwen3_5MoeExpertModule(self.hidden_dim, expert_dim) for _ in range(self.num_experts) ] ) @@ -771,17 +777,31 @@ def _copy_weight(module, weight): for idx in range(self.num_experts): # gate_up_proj shape: (num_experts, 2*intermediate_dim, hidden_dim) # Already in (out_features, in_features) format, no transpose needed - _copy_weight(gate_proj[idx], self.gate_up_proj[idx, :expert_dim, :]) - _copy_weight(up_proj[idx], self.gate_up_proj[idx, expert_dim:, :]) + _copy_weight(expert_modules[idx].gate_proj, self.gate_up_proj[idx, :expert_dim, :]) + _copy_weight(expert_modules[idx].up_proj, self.gate_up_proj[idx, expert_dim:, :]) # down_proj shape: (num_experts, hidden_dim, intermediate_dim) # Already in (out_features, in_features) format - _copy_weight(down_proj[idx], self.down_proj[idx]) + _copy_weight(expert_modules[idx].down_proj, self.down_proj[idx]) delattr(self, "gate_up_proj") delattr(self, "down_proj") - self.gate_proj = gate_proj - self.up_proj = up_proj - self.down_proj = down_proj + # Register expert modules directly as numbered children (like nn.ModuleList) + # so the naming pattern is: experts.{id}.gate_proj.weight (no extra nesting) + for idx in range(self.num_experts): + self.add_module(str(idx), expert_modules[idx]) + + def __len__(self): + """Support len() so the module is iterable like standard MoE experts.""" + return self.num_experts + + def __iter__(self): + """Support iteration over expert modules.""" + for idx in range(self.num_experts): + yield getattr(self, str(idx)) + + def __getitem__(self, idx): + """Support indexing to get individual expert modules.""" + return getattr(self, str(int(idx))) def forward( self, @@ -801,10 +821,11 @@ def forward( with torch.no_grad(): top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) current_state = hidden_states[token_idx] - gate = self.gate_proj[expert_idx](current_state) - up = self.up_proj[expert_idx](current_state) + expert = self[expert_idx] + gate = expert.gate_proj(current_state) + up = expert.up_proj(current_state) current_hidden_states = self.act_fn(gate) * up - current_hidden_states = self.down_proj[expert_idx](current_hidden_states) + current_hidden_states = expert.down_proj(current_hidden_states) current_hidden_states = ( current_hidden_states * top_k_weights[token_idx, top_k_pos, None] ) From 6e8514189133cc48efcc960f71834cbb37e123dc Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 16 Feb 2026 17:19:04 -0800 Subject: [PATCH 06/10] adopt *experts.{id}.* naming pattern Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 49 ++++++++++++------- .../torch/quantization/plugins/huggingface.py | 12 ++--- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index b7ecf0706..7878d8ce7 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -993,6 +993,36 @@ def _export_diffusers_checkpoint( print(f"Export complete. Saved to: {export_dir}") +def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict: + """No-op replacement for transformers' revert_weight_conversion.""" + return state_dict + + +def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]: + """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors.""" + import importlib + + patches: list[tuple[Any, Any]] = [] + for mod_path in [ + "transformers.core_model_loading", + "transformers.modeling_utils", + ]: + try: + mod = importlib.import_module(mod_path) + if hasattr(mod, "revert_weight_conversion"): + patches.append((mod, getattr(mod, "revert_weight_conversion"))) + setattr(mod, "revert_weight_conversion", _revert_weight_conversion_noop) + except (ImportError, AttributeError): + pass + return patches + + +def _unpatch_revert_weight_conversion(patches: list[tuple[Any, Any]]) -> None: + """Restore the original revert_weight_conversion functions.""" + for mod, original in patches: + mod.revert_weight_conversion = original + + def export_hf_checkpoint( model: Any, dtype: torch.dtype | None = None, @@ -1056,21 +1086,7 @@ def export_hf_checkpoint( # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError). # We must patch both the source module and the importing module since # modeling_utils does `from core_model_loading import revert_weight_conversion`. - _patches = [] - _noop = lambda model, state_dict: state_dict - for _mod_path in [ - "transformers.core_model_loading", - "transformers.modeling_utils", - ]: - try: - import importlib - - _mod = importlib.import_module(_mod_path) - if hasattr(_mod, "revert_weight_conversion"): - _patches.append((_mod, getattr(_mod, "revert_weight_conversion"))) - setattr(_mod, "revert_weight_conversion", _noop) - except (ImportError, AttributeError): - pass + _patches = _patch_revert_weight_conversion() try: model.save_pretrained( @@ -1079,8 +1095,7 @@ def export_hf_checkpoint( save_modelopt_state=save_modelopt_state, ) finally: - for _mod, _original in _patches: - _mod.revert_weight_conversion = _original + _unpatch_revert_weight_conversion(_patches) original_config = f"{export_dir}/config.json" config_data = {} diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 068665dae..4bc07996b 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -734,7 +734,7 @@ def forward( return next_states -class _Qwen3_5MoeExpertModule(nn.Module): +class _Qwen35MoeExpertModule(nn.Module): """Container for a single Qwen3.5 MoE expert's linear layers. Produces the naming pattern: experts.{id}.gate_proj.weight @@ -748,7 +748,7 @@ def __init__(self, hidden_dim: int, expert_dim: int): self.down_proj = nn.Linear(expert_dim, hidden_dim, bias=False) -class _QuantQwen3_5MoeExperts(QuantModule): +class _QuantQwen35MoeExperts(QuantModule): def _setup(self): """Modify the Qwen3_5MoeExperts by using per-expert nn.Module containers. @@ -769,7 +769,7 @@ def _copy_weight(module, weight): with init_empty_weights(): expert_modules = nn.ModuleList( [ - _Qwen3_5MoeExpertModule(self.hidden_dim, expert_dim) + _Qwen35MoeExpertModule(self.hidden_dim, expert_dim) for _ in range(self.num_experts) ] ) @@ -983,7 +983,7 @@ def unpack_weight(self): pass -class _QuantQwen3_5MoeSparseMoeBlock(_QuantSparseMoe): +class _QuantQwen35MoeSparseMoeBlock(_QuantSparseMoe): """Qwen3.5 MoE stores top_k/num_experts in the router (self.gate), not as direct attributes. We override forward instead of just bridging attributes because the router (self.gate) @@ -1012,12 +1012,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if Qwen3_5MoeSparseMoeBlock not in QuantModuleRegistry: QuantModuleRegistry.register({Qwen3_5MoeSparseMoeBlock: "hf.Qwen3_5MoeSparseMoeBlock"})( - _QuantQwen3_5MoeSparseMoeBlock + _QuantQwen35MoeSparseMoeBlock ) if Qwen3_5MoeExperts not in QuantModuleRegistry: QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})( - _QuantQwen3_5MoeExperts + _QuantQwen35MoeExperts ) except ImportError: pass From 811ef61cae8c1dba871ea723bdc40050adf074f7 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 16 Feb 2026 17:20:21 -0800 Subject: [PATCH 07/10] adopt *experts.{id}.* naming pattern Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 27 ++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 7878d8ce7..e965d14ed 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -998,22 +998,31 @@ def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict: return state_dict -def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]: - """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors.""" +def _try_patch_module(mod_path: str) -> tuple[Any, Any] | None: + """Try to patch revert_weight_conversion in a single module.""" import importlib + try: + mod = importlib.import_module(mod_path) + if hasattr(mod, "revert_weight_conversion"): + original = getattr(mod, "revert_weight_conversion") + setattr(mod, "revert_weight_conversion", _revert_weight_conversion_noop) + return (mod, original) + except (ImportError, AttributeError): + pass + return None + + +def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]: + """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors.""" patches: list[tuple[Any, Any]] = [] for mod_path in [ "transformers.core_model_loading", "transformers.modeling_utils", ]: - try: - mod = importlib.import_module(mod_path) - if hasattr(mod, "revert_weight_conversion"): - patches.append((mod, getattr(mod, "revert_weight_conversion"))) - setattr(mod, "revert_weight_conversion", _revert_weight_conversion_noop) - except (ImportError, AttributeError): - pass + result = _try_patch_module(mod_path) + if result is not None: + patches.append(result) return patches From bd029e5703c3af842fd597651d44b852dfb84938 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 25 Feb 2026 22:29:06 -0800 Subject: [PATCH 08/10] address reviews Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/layer_utils.py | 21 ++++-------- .../torch/quantization/plugins/huggingface.py | 34 +++---------------- 2 files changed, 10 insertions(+), 45 deletions(-) diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index 820f94f39..acb5b9ac2 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -327,21 +327,12 @@ def is_mlp(module: nn.Module) -> bool: def is_moe(module: nn.Module) -> bool: """Returns whether the module is an MOE layer.""" - return any( - key in type(module).__name__.lower() - for key in [ - "MixtralSparseMoeBlock".lower(), - "ArcticMoE".lower(), - "DbrxFFN".lower(), - "MoELayer".lower(), - "PhimoeSparseMoeBlock".lower(), - "DeepseekMoE".lower(), - "Qwen2MoeSparseMoeBlock".lower(), - "Qwen3MoeSparseMoeBlock".lower(), - "Qwen3NextSparseMoeBlock".lower(), - "Qwen3_5MoeSparseMoeBlock".lower(), - ] - ) + name = type(module).__name__.lower() + # Auto-detect common MoE patterns + if name.endswith("sparsemoeblock") or "moelayer" in name: + return True + # Explicit matches for non-standard naming + return any(key in name for key in ["arcticmoe", "deepseekmoe", "dbrxffn"]) def is_quantlinear(module: nn.Module) -> bool: diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 4bc07996b..321b5a1ac 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -983,38 +983,12 @@ def unpack_weight(self): pass -class _QuantQwen35MoeSparseMoeBlock(_QuantSparseMoe): - """Qwen3.5 MoE stores top_k/num_experts in the router (self.gate), not as direct attributes. - - We override forward instead of just bridging attributes because the router (self.gate) - uses its own top_k internally for routing decisions. We must modify self.gate.top_k - directly so all experts see calibration data. - """ - - def _setup(self): - self.num_experts = self.experts.num_experts - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - if any(getattr(m, "_if_calib", False) for m in self.experts.modules()): - # Force all tokens to all experts during calibration - original_top_k = self.gate.top_k - self.gate.top_k = self.num_experts - super(_QuantSparseMoe, self).forward(hidden_states) - self.gate.top_k = original_top_k - return super(_QuantSparseMoe, self).forward(hidden_states) - - try: - from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import ( - Qwen3_5MoeExperts, - Qwen3_5MoeSparseMoeBlock, - ) - - if Qwen3_5MoeSparseMoeBlock not in QuantModuleRegistry: - QuantModuleRegistry.register({Qwen3_5MoeSparseMoeBlock: "hf.Qwen3_5MoeSparseMoeBlock"})( - _QuantQwen35MoeSparseMoeBlock - ) + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import Qwen3_5MoeExperts + # Qwen3_5MoeSparseMoeBlock registration is handled by register_sparse_moe_on_the_fly + # (auto-detected via gate.top_k + gate.num_experts + experts pattern). + # Only the fused expert weights need explicit registration. if Qwen3_5MoeExperts not in QuantModuleRegistry: QuantModuleRegistry.register({Qwen3_5MoeExperts: "hf.Qwen3_5MoeExperts"})( _QuantQwen35MoeExperts From 86d35a5a7b3e0e5c2bf3b923718b6799c840bdc8 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 26 Feb 2026 00:33:01 -0800 Subject: [PATCH 09/10] minor Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index e965d14ed..4e9e3ba32 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -993,6 +993,11 @@ def _export_diffusers_checkpoint( print(f"Export complete. Saved to: {export_dir}") +# TODO: Remove this workaround once HuggingFace fixes revert_weight_conversion to handle +# scalar (0-d) tensors. The bug is in transformers' Chunk.convert() which calls +# tensor.size(self.dim) on quantization scale buffers that are 0-d scalars, causing +# IndexError. Confirmed still present in transformers 5.2.0. +# See: transformers/core_model_loading.py, Chunk.convert() def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict: """No-op replacement for transformers' revert_weight_conversion.""" return state_dict From c7bb2916ac39d62ae9561eee3b46cf241b7fd476 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Thu, 26 Feb 2026 13:02:45 -0800 Subject: [PATCH 10/10] update doc Signed-off-by: Zhiyu Cheng --- CHANGELOG.rst | 1 + examples/llm_ptq/README.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5d6b61c21..e08058013 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,6 +16,7 @@ NVIDIA Model Optimizer Changelog (Linux) - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md `_ for usage. - Add support for rotating the input before quantization for RHT. - Add support for advanced weight scale search for NVFP4 quantization and its export path. +- Enable PTQ workflow for Qwen3.5 MoE models. 0.42 (2026-02-xx) ^^^^^^^^^^^^^^^^^ diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 187eed7f1..7a9a71f88 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -106,7 +106,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http | Llama-Nemotron Ultra | ✅ | ❌ | ❌ | ❌ | ❌ | | Gemma 3 | ✅2 | - | ✅ | - | - | | QWen 2, 2.5 4 | ✅ | ✅ | ✅ | ✅ | ✅ | -| QWen3 MOE, Next 6 | ✅ | - | - | - | ✅ | +| QWen3, 3.5 MOE, Next 6 | ✅ | - | - | - | ✅ | | QwQ | ✅ | - | - | - | ✅ | | DeepSeek V3, R1, V3.1, V3.27 | - | - | - | - | ✅ | | GLM-4.78 | ✅ | - | - | - | ✅ | @@ -402,6 +402,7 @@ print(llm_fp8.generate(["What's the age of the earth? "])) | QWen3 | FP4 | ✅ | ✅ | - | | QWen3 MoE | FP8 | ✅ | ✅ | ✅ | | QWen3 MoE | FP4 | ✅ | - | - | +| QWen3.5 MoE | FP4 | - | - | ✅ | | QWen2.5 | FP8 | ✅ | ✅ | ✅ | | QWen2.5 | FP4 | ✅ | ✅ | - | | QwQ-32B | FP8 | ✅ | ✅ | ✅ |