diff --git a/.gitignore b/.gitignore index 896b38a12..bf707424f 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,5 @@ save* .log *.pid *.ipynb* +.venv/ +*.sh \ No newline at end of file diff --git a/README.md b/README.md index ae17adb09..a6b368dae 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates **Docs**: [English](https://llmc-en.readthedocs.io/en/latest/), [Chinese](https://llmc-zhcn.readthedocs.io/en/latest/). +> **Recommended Python Version**: We recommend using **Python 3.11** for local development and installation. This matches the project's Docker images and CI configuration, and is generally more stable than Python 3.12 for the current dependency set. + ## :fire: Latest News - **Nov 9, 2025:** 🍺🍺🍺 Our work [**LLMC+: Benchmarking Vision-Language Model Compression with a Plug-and-play Toolkit**](https://arxiv.org/abs/2508.09981) has been accepted by AAAI 2026. diff --git a/README_zh.md b/README_zh.md index d67ee5f32..6523d0b0c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -31,6 +31,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates **文档**: [English](https://llmc-en.readthedocs.io/en/latest/)、[中文](https://llmc-zhcn.readthedocs.io/en/latest/)。 +> **推荐 Python 版本**:建议本地开发和安装使用 **Python 3.11**。这与项目的 Docker 镜像和 CI 配置保持一致,并且对当前依赖集合而言通常比 Python 3.12 更稳定。 + ## :fire: 最新动态 - **2025年8月13日:** 🚀 我们已开源针对 **视觉语言模型(VLMs)** 的压缩方案,支持共计超过 **20 种算法**,涵盖 **token reduction** 和 **quantization**。此次发布为多模态任务提供了灵活、即插即用的压缩策略。具体请参阅[文档](https://llmc-zhcn.readthedocs.io/en/latest/advanced/token_reduction.html)。 diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index f2bbda675..f10ff95e7 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ 
b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -1,24 +1,25 @@ base: seed: &seed 42 model: - type: model_type - path: model path + type: Qwen3 + path: /home/michael/Project/models/Qwen3-0.6B torch_dtype: auto calib: name: pileval download: False - path: calib data path + path: /home/michael/Project/calib/pileval + n_sample: 128 n_samples: 128 bs: 1 seq_len: 2048 preproc: txt_general_preproc seed: *seed eval: - eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos + eval_pos: [] #long_ppl eval not support pretrain eval pos name: wikitext2 type: decode_ppl download: False - path: eval_data_path + path: /home/michael/Project/llmc_datasets/wikitext2 bs: 1 inference_per_block: False num_samples: 10 @@ -41,5 +42,7 @@ quant: symmetric: True granularity: per_tensor save: + save_calib_json: True + calib_json_name: kv_cache_calib.json save_fake: False - save_path: /path/to/save/ + save_path: /home/michael/Project/llmc_save diff --git a/llmc/__main__.py b/llmc/__main__.py index ec60c1492..44e0232ef 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -26,20 +26,27 @@ def main(config): + # 从注册表拿模型并实例化 + # 动态分配模型 model = MODEL_REGISTRY[config.model.type](config) + # 打印模型和tokenizer logger.info(f'model: {model}') logger.info(f'tokenizer: {model.get_tokenizer()}') + # 获得需要的评测种类 eval_list = get_eval_list(model, config) + # 真正执行评测 eval_model(model, None, eval_list, eval_pos='pretrain') blockwise_opts = [] + # 取出处理模态 modalities, modality_configs = get_modality(config) for modality, modality_config in zip(modalities, modality_configs): model.set_modality(modality) if not config.get('calib', False): + # 不需要校准数据 直接构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -51,14 +58,17 @@ def main(config): blockwise_opts.append(blockwise_opt) dist.barrier() else: + # 需要校准数据 dataset = BaseDataset( model.get_tokenizer(), config.calib, model.batch_process ) calib_data, 
padding_mask = dataset.get_calib_dataset() + # 收集第一层block输入 为后续blockwise算法需要的输入缓存下来 model.collect_first_block_input(calib_data, padding_mask) del calib_data gc.collect() torch.cuda.empty_cache() + # 构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -66,15 +76,36 @@ def main(config): model.get_padding_mask(), config, ) + # 项目逐层block做优化 blockwise_opt.run_block_loop() blockwise_opts.append(blockwise_opt) dist.barrier() + # 对变化后的浮点模型做评测 eval_model(model, blockwise_opts, eval_list, eval_pos='transformed') + # 只有rank 0继续做保存和导出 if int(os.environ['RANK']) == 0: + if 'save' in config and config.save.get('save_calib_json', False): + # 收集各个模态/量化器导出的校准结果。 + calib_json_list = [ + blockwise_opt.collect_calib_json() + for blockwise_opt in blockwise_opts + if hasattr(blockwise_opt, 'collect_calib_json') + ] + # 单模态时保持扁平结构,兼容 LightLLM 的校准文件格式。 + calib_json_payload = ( + calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list + ) + # 将最终的校准 JSON 写入配置指定的输出路径。 + with open(save_calib_json_path, 'w') as file: + json.dump(calib_json_payload, file, ensure_ascii=False, indent=4) + logger.info(f'save calib json done -- {save_calib_json_path}') + + # 保存变换后的浮点模型 if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) + # 保存TensorRT-LLM格式并构建engine if 'save' in config and config.save.get('save_trtllm', False): blockwise_opt.save_model(save_trtllm_trans_path) from llmc.utils.export_trtllm import cvt_trtllm_engine @@ -88,12 +119,15 @@ def main(config): eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant') eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant_wo_kv') + # 切换到fake quant部署模式再保存 if 'save' in config and config.save.get('save_fake', False): deploy_all_modality(blockwise_opts, 'fake_quant') blockwise_opt.save_model(save_fake_path) if 'save' in config: + # 导出真实量化模型给推理后端 if ( + # 导出前进行遍历检查 config.save.get('save_vllm', False) or config.save.get('save_sgl', False) or 
config.save.get('save_lightllm', False) @@ -101,9 +135,12 @@ def main(config): for modality_config in modality_configs: w, a = modality_config.weight, modality_config.get('act') + # 只允许特定bit类型 if isinstance(w.bit, str): + # 必须对称量化 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in ['e4m3', 'e3m4'], 'Supported quant: w8a16.' + # 有激活量化的话,那激活也要满足对称、bit合法的要求 if a: assert ( w.symmetric and a.symmetric ) @@ -114,6 +151,7 @@ def main(config): and a.bit in ['e4m3', 'e5m2'] ), 'Only WA FP8 quant is supported' else: + # 是整数则必须是4 or 8 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' if a: @@ -130,12 +168,15 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) + # 给特定后端(AutoAWQ)导出 elif config.save.get('save_autoawq', False): for modality_config in modality_configs: + # 只能4 bit 仅含有weight 不支持act assert ( modality_config.weight.bit in [4] and 'act' not in modality_config ), 'AutoAWQ supports only 4-bit weight-only quantization.' assert ( + # 不能对称量化 not modality_config.weight.symmetric ), 'Only asymmetric quant is supported.' 
@@ -161,11 +202,15 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_lightx2v_quant_config(save_quant_path) + # 判断是否有opencompass if 'opencompass' in config: assert config.save.get('save_trans', False) + # 从配置里读取cfg_path, output_path cfg_path = config['opencompass']['cfg_path'] output_path = config['opencompass']['output_path'] + # 取路径 eval_model_path = os.path.abspath(save_trans_path) + # 拼指令 opencompass_cmd = ( f'opencompass {cfg_path} -w {output_path} ' f'--llmc_cfg {args.config} ' @@ -173,6 +218,7 @@ def main(config): f'--llmc_model_path {eval_model_path}' ) logger.info(f'opencompass_cmd : {opencompass_cmd}') + # 执行 os.system(opencompass_cmd) dist.barrier() @@ -181,20 +227,25 @@ def main(config): logger.add(sys.stdout, level='INFO') llmc_start_time = time.time() parser = argparse.ArgumentParser() + # 解析命令行参数 parser.add_argument('--config', type=str, required=True) parser.add_argument('--task_id', type=str, required=True) args = parser.parse_args() with open(args.config, 'r') as file: + # 读取配置文件 config = yaml.safe_load(file) config = EasyDict(config) init_process_group(backend='nccl') + # 初始化分布式环境 设置GPU torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) + # 非主进程(rank != 0)关闭日志输出,避免重复打印 if int(os.environ['RANK']) != 0: logger.remove() + # 检查配置是否合法 check_config(config) logger.info(f'args: {args}') @@ -209,6 +260,12 @@ def main(config): # Ensure only the main process creates directories if int(os.environ['RANK']) == 0: if 'save' in config: + if config.save.get('save_calib_json', False): + mkdirs(config.save.save_path) + save_calib_json_path = os.path.join( + config.save.save_path, + config.save.get('calib_json_name', 'calib_scales.json'), + ) if config.save.get('save_trans', False): save_trans_path = os.path.join( config.save.save_path, 'transformed_model' @@ -266,3 +323,4 @@ def main(config): llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') logger.info('--- llmc finished ---') + diff --git 
a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 5a2232699..3240fda7e 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -175,13 +175,17 @@ def set_quant_config(self): self.act_quant_module = IntegerQuantizer elif quant_type == 'float-quant': self.act_quant_module = FloatQuantizer - self.quant_config['act']['tp'] = self.tp - self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.act_static = self.quant_config['act'].get('static', False) if self.act_static: assert ( self.quant_config['act']['granularity'] == 'per_tensor' ), 'Only support per_tensor static quant' + # 静态激活量化会走批量校准接口,这里把默认的 minmax + # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax': + self.quant_config['act']['calib_algo'] = 'static_minmax' + self.quant_config['act']['tp'] = self.tp + self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.quant_attn = self.quant_config['act'].get('quant_attn', False) if self.quant_attn: assert self.config['model']['type'] in ['Vit', 'DeepseekV2'] @@ -203,8 +207,10 @@ def set_quant_config(self): kv_special_cfg = self.quant_config['kvcache'].get('special', {}) act_static_cfg = {} if self.act_static: - act_static_cfg.update(self.config.calib.n_sample) - act_static_cfg.update(self.config.calib.bs) + # KV cache 构造函数接收的是 num_samples / bsz, + # 这里把校准配置里的字段名映射成它实际需要的参数名。 + act_static_cfg['num_samples'] = self.config.calib.n_sample + act_static_cfg['bsz'] = self.config.calib.bs kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant') self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']]( kv_quant_type, self.quant_config['kvcache'], @@ -1003,6 +1009,111 @@ def contiguous_params(self): if not param.is_contiguous(): param.data = param.data.contiguous() + # 将张量等对象转换成 JSON 可直接写出的 
Python 基础类型。 + def _to_jsonable(self, value): + if isinstance(value, torch.Tensor): + return value.detach().cpu().tolist() + return value + + # 统一把输入规整成 CPU tensor,便于后续做范围计算和序列化。 + def _to_tensor(self, value, dtype=torch.float32): + if isinstance(value, torch.Tensor): + return value.detach().cpu().to(dtype) + return torch.as_tensor(value, dtype=dtype) + + # LightLLM 需要的是离线 FP8 KV 的 descale,这里先根据 qparams 还原实数范围, + # 再换算成与 torch.float8_e4m3fn 对齐的每层 K/V scale。 + def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax): + if isinstance(scales, torch.Tensor) and scales.numel() == 0: + return None + + scales_tensor = self._to_tensor(scales) + zeros_tensor = self._to_tensor(zeros, dtype=scales_tensor.dtype) + qmin_tensor = self._to_tensor(qmin, dtype=scales_tensor.dtype) + qmax_tensor = self._to_tensor(qmax, dtype=scales_tensor.dtype) + min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor + max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor + absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs()) + fp8_qmax = torch.tensor( + torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype + ) + return absmax_tensor / fp8_qmax + + # 按 LightLLM 的 kv_cache_calib.json 结构导出校准结果, + # 目前只支持它已经接入的 per_tensor / per_head 两种 KV 格式。 + def collect_calib_json(self): + if not getattr(self, 'quant_kvcache', False): + raise ValueError('save_calib_json requires kvcache quantization.') + + kv_cfg = self.quant_config['kvcache'] + granularity = kv_cfg.get('granularity') + # LightLLM 当前只识别 per_tensor 和 per_head 两种静态 KV 校准文件。 + if granularity not in ['per_tensor', 'per_head']: + raise ValueError( + f'LightLLM calib export only supports per_tensor/per_head, got {granularity}' + ) + + num_layers = self.model.model_config.num_hidden_layers + # LightLLM 会校验 KV head 数;如果模型配置里没有这个字段,再退回总 head 数。 + num_head = int( + getattr( + self.model.model_config, + 'num_key_value_heads', + self.model.get_num_attention_heads(), + ) + ) + scales = [] + # 每层导出一行,顺序固定为 [k_scale..., 
v_scale...]。 + for layer_idx in range(num_layers): + key_scale = self._collect_lightllm_kv_scale( + self.kv_module.k_scales_buffer[layer_idx], + self.kv_module.k_zeros_buffer[layer_idx], + self.kv_module.k_qmin_buffer[layer_idx], + self.kv_module.k_qmax_buffer[layer_idx], + ) + value_scale = self._collect_lightllm_kv_scale( + self.kv_module.v_scales_buffer[layer_idx], + self.kv_module.v_zeros_buffer[layer_idx], + self.kv_module.v_qmin_buffer[layer_idx], + self.kv_module.v_qmax_buffer[layer_idx], + ) + if key_scale is None or value_scale is None: + raise ValueError(f'Calibration scale for layer {layer_idx} is empty.') + + scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist() + scales.append(scale_row) + + scale_width = len(scales[0]) if scales else 0 + # per_tensor 每层只能有 [k_scale, v_scale] 两个值; + # per_head 则需要每层 2 * num_head 个值。 + if granularity == 'per_tensor' and scale_width != 2: + raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}') + if granularity == 'per_head' and scale_width != num_head * 2: + raise ValueError( + f'per_head export expects {num_head * 2} scales per layer, got {scale_width}' + ) + + # 优先复用 Hugging Face config 里的 architectures 字段, + # 缺失时退回到 LLMC 配置里的模型类型,便于 LightLLM 做架构一致性校验。 + architectures = getattr(self.model.model_config, 'architectures', None) + if isinstance(architectures, list) and len(architectures) > 0: + architectures = architectures[0] + elif architectures is None: + architectures = self.config.model.type + + # 顶层字段名称和含义对齐 LightLLM PR #1220 中的 kv_cache_calib.json。 + return { + 'version': '1.0', + 'architectures': architectures, + 'quant_type': granularity, + 'qmin': float(torch.finfo(torch.float8_e4m3fn).min), + 'qmax': float(torch.finfo(torch.float8_e4m3fn).max), + 'num_layers': num_layers, + 'num_head': num_head, + 'scales_shape': [num_layers, scale_width], + 'scales': scales, + } + @torch.no_grad() def save_model(self, path): if int(os.environ['RANK']) != 0: diff --git 
a/llmc/compression/quantization/kvquant.py b/llmc/compression/quantization/kvquant.py index 32c2de5be..6cbe75ffb 100644 --- a/llmc/compression/quantization/kvquant.py +++ b/llmc/compression/quantization/kvquant.py @@ -1,3 +1,4 @@ +import copy import torch from loguru import logger from transformers import DynamicCache @@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache): def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1): super().__init__() + # 复制一份配置,避免在静态 KV 校准场景下修改原始量化配置对象。 + kvquant_cfg = copy.deepcopy(kvquant_cfg) assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group'] self.num_hidden_layers, self.num_samples, self.bsz = ( num_hidden_layers, num_samples, bsz, ) + if kvquant_cfg.get('static', False) and kvquant_cfg.get( + 'calib_algo', 'minmax' + ) == 'minmax': + # 静态 KV 校准会走批量张量统计接口,这里把默认的 minmax + # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + kvquant_cfg['calib_algo'] = 'static_minmax' if quant_type == 'int-quant': self.kvquantizer = IntegerQuantizer(**kvquant_cfg) elif quant_type == 'float-quant':