
Commit 408a574

Add parameter-based auto-pruning for Minitron
Signed-off-by: Keval Morabia <[email protected]>
1 parent 3350b0a commit 408a574

File tree

13 files changed: +386 −148 lines


CHANGELOG.rst

Lines changed: 2 additions & 1 deletion
@@ -14,6 +14,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add support for parallel draft heads in Eagle speculative decoding.
 - Add support to enable custom emulated quantization backend. See :meth:`register_quant_backend <modelopt.torch.quantization.nn.modules.tensor_quantizer.register_quant_backend>` for more details. See an example in ``tests/unit/torch/quantization/test_custom_backend.py``.
 - Add ``examples/llm_qad`` for QAD training with Megatron-LM.
+- Add support for ``params`` constraint based automatic neural architecture search in Minitron pruning (``mcore_minitron``) as an alternative to manual pruning using ``export_config``. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning>`_ for more details on its usage.

 **Deprecations**

@@ -80,7 +81,7 @@ NVIDIA Model Optimizer Changelog (Linux)

 **Documentation**

-- Add general guidelines for Minitron pruning and distillation. See `examples/pruning/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
+- Add general guidelines for Minitron pruning and distillation. See `pruning guidelines <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/pruning#pruning-guidelines>`_ for more details.
 - Added example for exporting QLoRA checkpoint for vLLM deployment. Refer to `examples/llm_qat/README.md <https://github.com/NVIDIA/Model-Optimizer/blob/79ef31bc7269ba4da0cfab446da5b64509cbfcef/examples/llm_qat/README.md#qlora-deployment>`_ for more details

 0.37 (2025-10-08)
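
The new ``params`` constraint lets the Minitron mode (``mcore_minitron``) search for a pruned architecture that meets a parameter budget, instead of hand-specifying every pruned dimension through ``export_config``. A minimal usage sketch follows; the ``mtp.prune`` call shape mirrors the existing pruning example, while the exact constraint value format and the calibration ``forward_loop`` below are assumptions:

import modelopt.torch.prune as mtp

def forward_loop(model):
    # Calibration forward passes used to score prunable components.
    # `calib_dataloader` is a placeholder for your own data loader.
    for batch in calib_dataloader:
        model(batch)

# Prune the Megatron-Core model down to roughly a 6B-parameter budget
# (the numeric format of the "params" constraint is an assumption).
pruned_model, _ = mtp.prune(
    model,
    mode="mcore_minitron",
    constraints={"params": 6e9},
    dummy_input=None,
    config={"forward_loop": forward_loop},
)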

modelopt/torch/nas/plugins/megatron.py

Lines changed: 3 additions & 17 deletions
@@ -27,8 +27,6 @@
 from megatron.core.models.gpt import GPTModel
 from megatron.core.parallel_state import (
     get_data_parallel_group,
-    get_pipeline_model_parallel_group,
-    get_tensor_model_parallel_group,
     is_pipeline_first_stage,
     is_pipeline_last_stage,
 )
@@ -54,13 +52,8 @@
 from modelopt.torch.opt.searcher import ConstraintsDict
 from modelopt.torch.trace import Symbol
 from modelopt.torch.utils import distributed as dist
-from modelopt.torch.utils import (
-    get_module_device,
-    make_divisible,
-    param_num_from_forward,
-    print_rank_0,
-    random,
-)
+from modelopt.torch.utils import make_divisible, print_rank_0, random
+from modelopt.torch.utils.plugins import param_num_megatron

 from ..algorithms import (
     MODULE_TYPE_TO_CONSTRAINTS_FUNC,
@@ -1045,7 +1038,6 @@ def modify(
         *,
         hidden_size_divisor: int = 1,
         ffn_hidden_size_divisor: int = 1,
-        mamba_num_heads_divisor: int = 1,
         mamba_head_dim_divisor: int = 1,
         num_moe_experts_divisor: int = 1,
     ):
@@ -1054,7 +1046,6 @@ def modify(
         Args:
             hidden_size_divisor: The divisor of the hidden_size.
             ffn_hidden_size_divisor: The divisor of the mlp ffn_hidden_size.
-            mamba_num_heads_divisor: The divisor of the mamba num_heads.
             mamba_head_dim_divisor: The divisor of the mamba head_dim.
             num_moe_experts_divisor: The divisor of the number of MoE experts.
         """
@@ -1065,7 +1056,6 @@ def modify(
         for layer in self.decoder.layers:
             layer.modify(
                 ffn_hidden_size_divisor=ffn_hidden_size_divisor,
-                mamba_num_heads_divisor=mamba_num_heads_divisor,
                 mamba_head_dim_divisor=mamba_head_dim_divisor,
                 num_moe_experts_divisor=num_moe_experts_divisor,
             )
@@ -1142,11 +1132,7 @@ def constraint_eval_funcs(self) -> dict[str, ConstraintEvalFunc]:

     def _get_params(self, _: ConstraintsRes | None = None) -> float:
         """Get number of model parameters from forward pass."""
-        params = param_num_from_forward(self.model, args=self.dummy_input, unit=1.0)
-        reduced_params = torch.Tensor([params]).to(device=get_module_device(self.model))
-        torch.distributed.all_reduce(reduced_params, group=get_pipeline_model_parallel_group())
-        torch.distributed.all_reduce(reduced_params, group=get_tensor_model_parallel_group())
-        return reduced_params.item()
+        return param_num_megatron(self.model, from_forward=True, args=self.dummy_input)

     def _get_flops(self, _: ConstraintsRes | None = None) -> float:
         """Get inference FLOPs."""

modelopt/torch/opt/searcher.py

Lines changed: 2 additions & 1 deletion
@@ -35,7 +35,7 @@
 import torch.nn as nn

 from modelopt.torch.utils import distributed as dist
-from modelopt.torch.utils import no_stdout, run_forward_loop
+from modelopt.torch.utils import no_stdout, print_rank_0, run_forward_loop

 LimitsTuple = tuple[float, float]
 ConstraintsDict = dict[str, str | float | dict | None]
@@ -212,6 +212,7 @@ def construct_forward_loop(
             return None

         def forward_loop_with_silence_check(m: nn.Module) -> None:
+            print_rank_0("Running forward loop...")
             with no_stdout() if silent else nullcontext():
                 if data_loader is not None:
                     run_forward_loop(
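
The surrounding ``forward_loop_with_silence_check`` shows the silence-check pattern: suppress stdout during the calibration loop only when ``silent`` is set. A self-contained sketch of the same idea using only the standard library; ``no_stdout`` here is a stand-in for the modelopt utility of the same name:

import contextlib
import io

@contextlib.contextmanager
def no_stdout():
    # Stand-in for modelopt.torch.utils.no_stdout: swallow prints inside the block.
    with contextlib.redirect_stdout(io.StringIO()):
        yield

def run_with_silence_check(forward_loop, silent: bool = True) -> None:
    # Same conditional-context pattern as in construct_forward_loop.
    with no_stdout() if silent else contextlib.nullcontext():
        forward_loop()

run_with_silence_check(lambda: print("hidden"), silent=True)    # nothing printed
run_with_silence_check(lambda: print("visible"), silent=False)  # prints "visible"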

modelopt/torch/prune/__init__.py

Lines changed: 0 additions & 2 deletions
@@ -19,8 +19,6 @@
 simplifies the overall workflow to accommodate for the simpler nature of pruning algorithms.
 """

-# nas is a required - so let's check if it's available
-import modelopt.torch.nas
 from modelopt.torch.utils import import_plugin

 from . import fastnas, gradnas, plugins
