Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ save*
.log
*.pid
*.ipynb*
.venv/
*.sh
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates

**Docs**: [English](https://llmc-en.readthedocs.io/en/latest/), [Chinese](https://llmc-zhcn.readthedocs.io/en/latest/).

> **Recommended Python Version**: We recommend using **Python 3.11** for local development and installation. This matches the project's Docker images and CI configuration, and is generally more stable than Python 3.12 for the current dependency set.

## :fire: Latest News

- **Nov 9, 2025:** 🍺🍺🍺 Our work [**LLMC+: Benchmarking Vision-Language Model Compression with a Plug-and-play Toolkit**](https://arxiv.org/abs/2508.09981) has been accepted by AAAI 2026.
Expand Down
2 changes: 2 additions & 0 deletions README_zh.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates

**文档**: [English](https://llmc-en.readthedocs.io/en/latest/)、[中文](https://llmc-zhcn.readthedocs.io/en/latest/)。

> **推荐 Python 版本**:建议本地开发和安装使用 **Python 3.11**。这与项目的 Docker 镜像和 CI 配置保持一致,并且对当前依赖集合而言通常比 Python 3.12 更稳定。

## :fire: 最新动态

- **2025年8月13日:** 🚀 我们已开源针对 **视觉语言模型(VLMs)** 的压缩方案,支持共计超过 **20 种算法**,涵盖 **token reduction** 和 **quantization**。此次发布为多模态任务提供了灵活、即插即用的压缩策略。具体请参阅[文档](https://llmc-zhcn.readthedocs.io/en/latest/advanced/token_reduction.html)。
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
base:
seed: &seed 42
model:
type: model_type
path: model path
type: Qwen3
path: /home/michael/Project/models/Qwen3-0.6B
torch_dtype: auto
calib:
name: pileval
download: False
path: calib data path
path: /home/michael/Project/calib/pileval
n_sample: 128
n_samples: 128
bs: 1
seq_len: 2048
preproc: txt_general_preproc
seed: *seed
eval:
eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos
eval_pos: [] # long_ppl eval does not support the pretrain eval pos
name: wikitext2
type: decode_ppl
download: False
path: eval_data_path
path: /home/michael/Project/llmc_datasets/wikitext2
bs: 1
inference_per_block: False
num_samples: 10
Expand All @@ -41,5 +42,7 @@ quant:
symmetric: True
granularity: per_tensor
save:
save_calib_json: True
calib_json_name: kv_cache_calib.json
save_fake: False
save_path: /path/to/save/
save_path: /home/michael/Project/llmc_save
58 changes: 58 additions & 0 deletions llmc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,27 @@


def main(config):
# 从注册表拿模型并实例化
# 动态分配模型
model = MODEL_REGISTRY[config.model.type](config)

# 打印模型和tokenizer
logger.info(f'model: {model}')
logger.info(f'tokenizer: {model.get_tokenizer()}')

# 获得需要的评测种类
eval_list = get_eval_list(model, config)
# 真正执行评测
eval_model(model, None, eval_list, eval_pos='pretrain')

blockwise_opts = []
# 取出处理模态
modalities, modality_configs = get_modality(config)

for modality, modality_config in zip(modalities, modality_configs):
model.set_modality(modality)
if not config.get('calib', False):
# 不需要校准数据 直接构造算法对象
blockwise_opt = ALGO_REGISTRY[modality_config.method](
model,
modality_config,
Expand All @@ -51,30 +58,54 @@ def main(config):
blockwise_opts.append(blockwise_opt)
dist.barrier()
else:
# 需要校准数据
dataset = BaseDataset(
model.get_tokenizer(), config.calib, model.batch_process
)
calib_data, padding_mask = dataset.get_calib_dataset()
# 收集第一层block输入 为后续blockwise算法需要的输入缓存下来
model.collect_first_block_input(calib_data, padding_mask)
del calib_data
gc.collect()
torch.cuda.empty_cache()
# 构造算法对象
blockwise_opt = ALGO_REGISTRY[modality_config.method](
model,
modality_config,
model.get_first_block_input(),
model.get_padding_mask(),
config,
)
# 对项目逐层block做优化
blockwise_opt.run_block_loop()
blockwise_opts.append(blockwise_opt)
dist.barrier()

# 对变化后的浮点模型做评测
eval_model(model, blockwise_opts, eval_list, eval_pos='transformed')
# 只有rank 0继续做保存和导出
if int(os.environ['RANK']) == 0:
if 'save' in config and config.save.get('save_calib_json', False):
# 收集各个模态/量化器导出的校准结果。
calib_json_list = [
blockwise_opt.collect_calib_json()
for blockwise_opt in blockwise_opts
if hasattr(blockwise_opt, 'collect_calib_json')
]
# 单模态时保持扁平结构,兼容 LightLLM 的校准文件格式。
calib_json_payload = (
calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list
)
# 将最终的校准 JSON 写入配置指定的输出路径。
with open(save_calib_json_path, 'w') as file:
json.dump(calib_json_payload, file, ensure_ascii=False, indent=4)
logger.info(f'save calib json done -- {save_calib_json_path}')

# 保存变换后的浮点模型
if 'save' in config and config.save.get('save_trans', False):
blockwise_opt.save_model(save_trans_path)

# 保存TensorRT-LLM格式并构建engine
if 'save' in config and config.save.get('save_trtllm', False):
blockwise_opt.save_model(save_trtllm_trans_path)
from llmc.utils.export_trtllm import cvt_trtllm_engine
Expand All @@ -88,22 +119,28 @@ def main(config):
eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant')
eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant_wo_kv')

# 切换到fake quant部署模式再保存
if 'save' in config and config.save.get('save_fake', False):
deploy_all_modality(blockwise_opts, 'fake_quant')
blockwise_opt.save_model(save_fake_path)

if 'save' in config:
# 导出真实量化模型给推理后端
if (
# 导出前进行遍历检查
config.save.get('save_vllm', False)
or config.save.get('save_sgl', False)
or config.save.get('save_lightllm', False)
):
for modality_config in modality_configs:
w, a = modality_config.weight, modality_config.get('act')

# 只允许特定bit类型
if isinstance(w.bit, str):
# 必须对称量化
assert w.symmetric, 'Only symmetric quant is supported.'
assert w.bit in ['e4m3', 'e3m4'], 'Supported quant: w8a16.'
# 有激活量化的话,那激活也要满足对称、bit合法的要求
if a:
assert (
w.symmetric and a.symmetric
Expand All @@ -114,6 +151,7 @@ def main(config):
and a.bit in ['e4m3', 'e5m2']
), 'Only WA FP8 quant is supported'
else:
# 是整数则必须是4 or 8
assert w.symmetric, 'Only symmetric quant is supported.'
assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.'
if a:
Expand All @@ -130,12 +168,15 @@ def main(config):
blockwise_opt.save_model(save_quant_path)
update_vllm_quant_config(blockwise_opt.model, config, save_quant_path)

# 给特定后端(AutoAWQ)导出
elif config.save.get('save_autoawq', False):
for modality_config in modality_configs:
# 只能4 bit 仅含有weight 不支持act
assert (
modality_config.weight.bit in [4] and 'act' not in modality_config
), 'AutoAWQ supports only 4-bit weight-only quantization.'
assert (
# 不能对称量化
not modality_config.weight.symmetric
), 'Only asymmetric quant is supported.'

Expand All @@ -161,18 +202,23 @@ def main(config):
blockwise_opt.save_model(save_quant_path)
update_lightx2v_quant_config(save_quant_path)

# 判断是否有opencompass
if 'opencompass' in config:
assert config.save.get('save_trans', False)
# 从配置里读取cfg_path, output_path
cfg_path = config['opencompass']['cfg_path']
output_path = config['opencompass']['output_path']
# 取路径
eval_model_path = os.path.abspath(save_trans_path)
# 拼指令
opencompass_cmd = (
f'opencompass {cfg_path} -w {output_path} '
f'--llmc_cfg {args.config} '
f'--llmc_eval_mode quant '
f'--llmc_model_path {eval_model_path}'
)
logger.info(f'opencompass_cmd : {opencompass_cmd}')
# 执行
os.system(opencompass_cmd)
dist.barrier()

Expand All @@ -181,20 +227,25 @@ def main(config):
logger.add(sys.stdout, level='INFO')
llmc_start_time = time.time()
parser = argparse.ArgumentParser()
# 解析命令行参数
parser.add_argument('--config', type=str, required=True)
parser.add_argument('--task_id', type=str, required=True)
args = parser.parse_args()

with open(args.config, 'r') as file:
# 读取配置文件
config = yaml.safe_load(file)
config = EasyDict(config)

init_process_group(backend='nccl')
# 初始化分布式环境 设置GPU
torch.cuda.set_device(int(os.environ['LOCAL_RANK']))

# 检查配置 打印依赖版本
if int(os.environ['RANK']) != 0:
logger.remove()

# 检查配置是否合法
check_config(config)

logger.info(f'args: {args}')
Expand All @@ -209,6 +260,12 @@ def main(config):
# Ensure only the main process creates directories
if int(os.environ['RANK']) == 0:
if 'save' in config:
if config.save.get('save_calib_json', False):
mkdirs(config.save.save_path)
save_calib_json_path = os.path.join(
config.save.save_path,
config.save.get('calib_json_name', 'calib_scales.json'),
)
if config.save.get('save_trans', False):
save_trans_path = os.path.join(
config.save.save_path, 'transformed_model'
Expand Down Expand Up @@ -266,3 +323,4 @@ def main(config):
llmc_duration_time = llmc_end_time - llmc_start_time
logger.info(f'llmc_duration_time: {llmc_duration_time} s')
logger.info('--- llmc finished ---')

Loading
Loading