diff --git a/.gitignore b/.gitignore index 896b38a12..bf707424f 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,5 @@ save* .log *.pid *.ipynb* +.venv/ +*.sh \ No newline at end of file diff --git a/README.md b/README.md index ae17adb09..a6b368dae 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates **Docs**: [English](https://llmc-en.readthedocs.io/en/latest/), [Chinese](https://llmc-zhcn.readthedocs.io/en/latest/). +> **Recommended Python Version**: We recommend using **Python 3.11** for local development and installation. This matches the project's Docker images and CI configuration, and is generally more stable than Python 3.12 for the current dependency set. + ## :fire: Latest News - **Nov 9, 2025:** 🍺🍺🍺 Our work [**LLMC+: Benchmarking Vision-Language Model Compression with a Plug-and-play Toolkit**](https://arxiv.org/abs/2508.09981) has been accepted by AAAI 2026. diff --git a/README_zh.md b/README_zh.md index d67ee5f32..6523d0b0c 100644 --- a/README_zh.md +++ b/README_zh.md @@ -31,6 +31,8 @@ docker pull registry.cn-hangzhou.aliyuncs.com/yongyang/llmcompression:pure-lates **文档**: [English](https://llmc-en.readthedocs.io/en/latest/)、[中文](https://llmc-zhcn.readthedocs.io/en/latest/)。 +> **推荐 Python 版本**:建议本地开发和安装使用 **Python 3.11**。这与项目的 Docker 镜像和 CI 配置保持一致,并且对当前依赖集合而言通常比 Python 3.12 更稳定。 + ## :fire: 最新动态 - **2025年8月13日:** 🚀 我们已开源针对 **视觉语言模型(VLMs)** 的压缩方案,支持共计超过 **20 种算法**,涵盖 **token reduction** 和 **quantization**。此次发布为多模态任务提供了灵活、即插即用的压缩策略。具体请参阅[文档](https://llmc-zhcn.readthedocs.io/en/latest/advanced/token_reduction.html)。 diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml index f2bbda675..f10ff95e7 100644 --- a/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml +++ 
b/configs/quantization/methods/KVQuant/rtn_w_a_pertensor_static_naive_quant_kv.yml @@ -1,24 +1,25 @@ base: seed: &seed 42 model: - type: model_type - path: model path + type: Qwen3 + path: /home/michael/Project/models/Qwen3-0.6B torch_dtype: auto calib: name: pileval download: False - path: calib data path + path: /home/michael/Project/calib/pileval + n_sample: 128 n_samples: 128 bs: 1 seq_len: 2048 preproc: txt_general_preproc seed: *seed eval: - eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos + eval_pos: [] #long_ppl eval not support pretrain eval pos name: wikitext2 type: decode_ppl download: False - path: eval_data_path + path: /home/michael/Project/llmc_datasets/wikitext2 bs: 1 inference_per_block: False num_samples: 10 @@ -41,5 +42,7 @@ quant: symmetric: True granularity: per_tensor save: + save_calib_json: True + calib_json_name: kv_cache_calib.json save_fake: False - save_path: /path/to/save/ + save_path: /home/michael/Project/llmc_save diff --git a/llmc/__main__.py b/llmc/__main__.py index ec60c1492..44e0232ef 100755 --- a/llmc/__main__.py +++ b/llmc/__main__.py @@ -26,20 +26,27 @@ def main(config): + # 从注册表拿模型并实例化 + # 动态分配模型 model = MODEL_REGISTRY[config.model.type](config) + # 打印模型和tokenizer logger.info(f'model: {model}') logger.info(f'tokenizer: {model.get_tokenizer()}') + # 获得需要的评测种类 eval_list = get_eval_list(model, config) + # 真正执行评测 eval_model(model, None, eval_list, eval_pos='pretrain') blockwise_opts = [] + # 取出处理模态 modalities, modality_configs = get_modality(config) for modality, modality_config in zip(modalities, modality_configs): model.set_modality(modality) if not config.get('calib', False): + # 不需要校准数据 直接构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -51,14 +58,17 @@ def main(config): blockwise_opts.append(blockwise_opt) dist.barrier() else: + # 需要校准数据 dataset = BaseDataset( model.get_tokenizer(), config.calib, model.batch_process ) calib_data, 
padding_mask = dataset.get_calib_dataset() + # 收集第一层block输入 为后续blockwise算法需要的输入缓存下来 model.collect_first_block_input(calib_data, padding_mask) del calib_data gc.collect() torch.cuda.empty_cache() + # 构造算法对象 blockwise_opt = ALGO_REGISTRY[modality_config.method]( model, modality_config, @@ -66,15 +76,36 @@ def main(config): model.get_padding_mask(), config, ) + # 项目逐层block做优化 blockwise_opt.run_block_loop() blockwise_opts.append(blockwise_opt) dist.barrier() + # 对变化后的浮点模型做评测 eval_model(model, blockwise_opts, eval_list, eval_pos='transformed') + # 只有rank 0继续做保存和导出 if int(os.environ['RANK']) == 0: + if 'save' in config and config.save.get('save_calib_json', False): + # 收集各个模态/量化器导出的校准结果。 + calib_json_list = [ + blockwise_opt.collect_calib_json() + for blockwise_opt in blockwise_opts + if hasattr(blockwise_opt, 'collect_calib_json') + ] + # 单模态时保持扁平结构,兼容 LightLLM 的校准文件格式。 + calib_json_payload = ( + calib_json_list[0] if len(calib_json_list) == 1 else calib_json_list + ) + # 将最终的校准 JSON 写入配置指定的输出路径。 + with open(save_calib_json_path, 'w') as file: + json.dump(calib_json_payload, file, ensure_ascii=False, indent=4) + logger.info(f'save calib json done -- {save_calib_json_path}') + + # 保存变换后的浮点模型 if 'save' in config and config.save.get('save_trans', False): blockwise_opt.save_model(save_trans_path) + # 保存TensorRT-LLM格式并构建engine if 'save' in config and config.save.get('save_trtllm', False): blockwise_opt.save_model(save_trtllm_trans_path) from llmc.utils.export_trtllm import cvt_trtllm_engine @@ -88,12 +119,15 @@ def main(config): eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant') eval_model(model, blockwise_opts, eval_list, eval_pos='fake_quant_wo_kv') + # 切换到fake quant部署模式再保存 if 'save' in config and config.save.get('save_fake', False): deploy_all_modality(blockwise_opts, 'fake_quant') blockwise_opt.save_model(save_fake_path) if 'save' in config: + # 导出真实量化模型给推理后端 if ( + # 导出前进行遍历检查 config.save.get('save_vllm', False) or config.save.get('save_sgl', False) or 
config.save.get('save_lightllm', False) @@ -101,9 +135,12 @@ def main(config): for modality_config in modality_configs: w, a = modality_config.weight, modality_config.get('act') + # 只允许特定bit类型 if isinstance(w.bit, str): + # 必须对称量化 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in ['e4m3', 'e3m4'], 'Supported quant: w8a16.' + # 有激活量化的话,那激活也要满足对称、bit合法的要求 if a: assert ( w.symmetric and a.symmetric ) @@ -114,6 +151,7 @@ def main(config): and a.bit in ['e4m3', 'e5m2'] ), 'Only WA FP8 quant is supported' else: + # 是整数则必须是4 or 8 assert w.symmetric, 'Only symmetric quant is supported.' assert w.bit in [4, 8], 'Supported quant: w4a16, w8a16, w8a8.' if a: @@ -130,12 +168,15 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_vllm_quant_config(blockwise_opt.model, config, save_quant_path) + # 给特定后端(AutoAWQ)导出 elif config.save.get('save_autoawq', False): for modality_config in modality_configs: + # 只能4 bit 仅含有weight 不支持act assert ( modality_config.weight.bit in [4] and 'act' not in modality_config ), 'AutoAWQ supports only 4-bit weight-only quantization.' assert ( + # 不能对称量化 not modality_config.weight.symmetric ), 'Only asymmetric quant is supported.' 
@@ -161,11 +202,15 @@ def main(config): blockwise_opt.save_model(save_quant_path) update_lightx2v_quant_config(save_quant_path) + # 判断是否有opencompass if 'opencompass' in config: assert config.save.get('save_trans', False) + # 从配置里读取cfg_path, output_path cfg_path = config['opencompass']['cfg_path'] output_path = config['opencompass']['output_path'] + # 取路径 eval_model_path = os.path.abspath(save_trans_path) + # 拼指令 opencompass_cmd = ( f'opencompass {cfg_path} -w {output_path} ' f'--llmc_cfg {args.config} ' @@ -173,6 +218,7 @@ def main(config): f'--llmc_model_path {eval_model_path}' ) logger.info(f'opencompass_cmd : {opencompass_cmd}') + # 执行 os.system(opencompass_cmd) dist.barrier() @@ -181,20 +227,25 @@ def main(config): logger.add(sys.stdout, level='INFO') llmc_start_time = time.time() parser = argparse.ArgumentParser() + # 解析命令行参数 parser.add_argument('--config', type=str, required=True) parser.add_argument('--task_id', type=str, required=True) args = parser.parse_args() with open(args.config, 'r') as file: + # 读取配置文件 config = yaml.safe_load(file) config = EasyDict(config) init_process_group(backend='nccl') + # 初始化分布式环境 设置GPU torch.cuda.set_device(int(os.environ['LOCAL_RANK'])) + # 非主进程(rank != 0)关闭日志输出,避免重复打印 if int(os.environ['RANK']) != 0: logger.remove() + # 检查配置是否合法 check_config(config) logger.info(f'args: {args}') @@ -209,6 +260,12 @@ def main(config): # Ensure only the main process creates directories if int(os.environ['RANK']) == 0: if 'save' in config: + if config.save.get('save_calib_json', False): + mkdirs(config.save.save_path) + save_calib_json_path = os.path.join( + config.save.save_path, + config.save.get('calib_json_name', 'calib_scales.json'), + ) if config.save.get('save_trans', False): save_trans_path = os.path.join( config.save.save_path, 'transformed_model' @@ -266,3 +323,4 @@ def main(config): llmc_duration_time = llmc_end_time - llmc_start_time logger.info(f'llmc_duration_time: {llmc_duration_time} s') logger.info('--- llmc finished ---') + diff --git 
a/llmc/compression/quantization/base_blockwise_quantization.py b/llmc/compression/quantization/base_blockwise_quantization.py index 5a2232699..3240fda7e 100755 --- a/llmc/compression/quantization/base_blockwise_quantization.py +++ b/llmc/compression/quantization/base_blockwise_quantization.py @@ -175,13 +175,17 @@ def set_quant_config(self): self.act_quant_module = IntegerQuantizer elif quant_type == 'float-quant': self.act_quant_module = FloatQuantizer - self.quant_config['act']['tp'] = self.tp - self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.act_static = self.quant_config['act'].get('static', False) if self.act_static: assert ( self.quant_config['act']['granularity'] == 'per_tensor' ), 'Only support per_tensor static quant' + # 静态激活量化会走批量校准接口,这里把默认的 minmax + # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + if self.quant_config['act'].get('calib_algo', 'minmax') == 'minmax': + self.quant_config['act']['calib_algo'] = 'static_minmax' + self.quant_config['act']['tp'] = self.tp + self.aquantizer = self.act_quant_module(**self.quant_config['act']) self.quant_attn = self.quant_config['act'].get('quant_attn', False) if self.quant_attn: assert self.config['model']['type'] in ['Vit', 'DeepseekV2'] @@ -203,8 +207,10 @@ def set_quant_config(self): kv_special_cfg = self.quant_config['kvcache'].get('special', {}) act_static_cfg = {} if self.act_static: - act_static_cfg.update(self.config.calib.n_sample) - act_static_cfg.update(self.config.calib.bs) + # KV cache 构造函数接收的是 num_samples / bsz, + # 这里把校准配置里的字段名映射成它实际需要的参数名。 + act_static_cfg['num_samples'] = self.config.calib.n_sample + act_static_cfg['bsz'] = self.config.calib.bs kv_quant_type = self.quant_config['kvcache'].get('quant_type', 'int-quant') self.kv_module = KV_REGISTRY[self.quant_config['kvcache']['method']]( kv_quant_type, self.quant_config['kvcache'], @@ -1003,6 +1009,111 @@ def contiguous_params(self): if not param.is_contiguous(): param.data = param.data.contiguous() + # 将张量等对象转换成 JSON 可直接写出的 
Python 基础类型。 + def _to_jsonable(self, value): + if isinstance(value, torch.Tensor): + return value.detach().cpu().tolist() + return value + + # 统一把输入规整成 CPU tensor,便于后续做范围计算和序列化。 + def _to_tensor(self, value, dtype=torch.float32): + if isinstance(value, torch.Tensor): + return value.detach().cpu().to(dtype) + return torch.as_tensor(value, dtype=dtype) + + # LightLLM 需要的是离线 FP8 KV 的 descale,这里先根据 qparams 还原实数范围, + # 再换算成与 torch.float8_e4m3fn 对齐的每层 K/V scale。 + def _collect_lightllm_kv_scale(self, scales, zeros, qmin, qmax): + if isinstance(scales, torch.Tensor) and scales.numel() == 0: + return None + + scales_tensor = self._to_tensor(scales) + zeros_tensor = self._to_tensor(zeros, dtype=scales_tensor.dtype) + qmin_tensor = self._to_tensor(qmin, dtype=scales_tensor.dtype) + qmax_tensor = self._to_tensor(qmax, dtype=scales_tensor.dtype) + min_tensor = (qmin_tensor - zeros_tensor) * scales_tensor + max_tensor = (qmax_tensor - zeros_tensor) * scales_tensor + absmax_tensor = torch.maximum(min_tensor.abs(), max_tensor.abs()) + fp8_qmax = torch.tensor( + torch.finfo(torch.float8_e4m3fn).max, dtype=absmax_tensor.dtype + ) + return absmax_tensor / fp8_qmax + + # 按 LightLLM 的 kv_cache_calib.json 结构导出校准结果, + # 目前只支持它已经接入的 per_tensor / per_head 两种 KV 格式。 + def collect_calib_json(self): + if not getattr(self, 'quant_kvcache', False): + raise ValueError('save_calib_json requires kvcache quantization.') + + kv_cfg = self.quant_config['kvcache'] + granularity = kv_cfg.get('granularity') + # LightLLM 当前只识别 per_tensor 和 per_head 两种静态 KV 校准文件。 + if granularity not in ['per_tensor', 'per_head']: + raise ValueError( + f'LightLLM calib export only supports per_tensor/per_head, got {granularity}' + ) + + num_layers = self.model.model_config.num_hidden_layers + # LightLLM 会校验 KV head 数;如果模型配置里没有这个字段,再退回总 head 数。 + num_head = int( + getattr( + self.model.model_config, + 'num_key_value_heads', + self.model.get_num_attention_heads(), + ) + ) + scales = [] + # 每层导出一行,顺序固定为 [k_scale..., 
v_scale...]。 + for layer_idx in range(num_layers): + key_scale = self._collect_lightllm_kv_scale( + self.kv_module.k_scales_buffer[layer_idx], + self.kv_module.k_zeros_buffer[layer_idx], + self.kv_module.k_qmin_buffer[layer_idx], + self.kv_module.k_qmax_buffer[layer_idx], + ) + value_scale = self._collect_lightllm_kv_scale( + self.kv_module.v_scales_buffer[layer_idx], + self.kv_module.v_zeros_buffer[layer_idx], + self.kv_module.v_qmin_buffer[layer_idx], + self.kv_module.v_qmax_buffer[layer_idx], + ) + if key_scale is None or value_scale is None: + raise ValueError(f'Calibration scale for layer {layer_idx} is empty.') + + scale_row = torch.cat([key_scale.reshape(-1), value_scale.reshape(-1)]).tolist() + scales.append(scale_row) + + scale_width = len(scales[0]) if scales else 0 + # per_tensor 每层只能有 [k_scale, v_scale] 两个值; + # per_head 则需要每层 2 * num_head 个值。 + if granularity == 'per_tensor' and scale_width != 2: + raise ValueError(f'per_tensor export expects 2 scales per layer, got {scale_width}') + if granularity == 'per_head' and scale_width != num_head * 2: + raise ValueError( + f'per_head export expects {num_head * 2} scales per layer, got {scale_width}' + ) + + # 优先复用 Hugging Face config 里的 architectures 字段, + # 缺失时退回到 LLMC 配置里的模型类型,便于 LightLLM 做架构一致性校验。 + architectures = getattr(self.model.model_config, 'architectures', None) + if isinstance(architectures, list) and len(architectures) > 0: + architectures = architectures[0] + elif architectures is None: + architectures = self.config.model.type + + # 顶层字段名称和含义对齐 LightLLM PR #1220 中的 kv_cache_calib.json。 + return { + 'version': '1.0', + 'architectures': architectures, + 'quant_type': granularity, + 'qmin': float(torch.finfo(torch.float8_e4m3fn).min), + 'qmax': float(torch.finfo(torch.float8_e4m3fn).max), + 'num_layers': num_layers, + 'num_head': num_head, + 'scales_shape': [num_layers, scale_width], + 'scales': scales, + } + @torch.no_grad() def save_model(self, path): if int(os.environ['RANK']) != 0: diff --git 
a/llmc/compression/quantization/kvquant.py b/llmc/compression/quantization/kvquant.py index 32c2de5be..6cbe75ffb 100644 --- a/llmc/compression/quantization/kvquant.py +++ b/llmc/compression/quantization/kvquant.py @@ -1,3 +1,4 @@ +import copy import torch from loguru import logger from transformers import DynamicCache @@ -12,12 +13,20 @@ class NaiveQuantKVCache(DynamicCache): def __init__(self, quant_type, kvquant_cfg, num_hidden_layers, num_samples=128, bsz=1): super().__init__() + # 复制一份配置,避免在静态 KV 校准场景下修改原始量化配置对象。 + kvquant_cfg = copy.deepcopy(kvquant_cfg) assert kvquant_cfg.granularity in ['per_token', 'per_tensor', 'per_group'] self.num_hidden_layers, self.num_samples, self.bsz = ( num_hidden_layers, num_samples, bsz, ) + if kvquant_cfg.get('static', False) and kvquant_cfg.get( + 'calib_algo', 'minmax' + ) == 'minmax': + # 静态 KV 校准会走批量张量统计接口,这里把默认的 minmax + # 归一化成对应的 static_minmax,避免后续校准时报算法名不匹配。 + kvquant_cfg['calib_algo'] = 'static_minmax' if quant_type == 'int-quant': self.kvquantizer = IntegerQuantizer(**kvquant_cfg) elif quant_type == 'float-quant':