42 changes: 42 additions & 0 deletions .github/workflows/reproducibility-check.yml
@@ -0,0 +1,42 @@
name: Reproducibility Check

on:
  pull_request:
    paths:
      - 'reproducibility/**'
      - 'examples/querygym_pyserini/pipeline.py'
      - 'dataset_registry.yaml'
  push:
    branches: [main]
    paths:
      - 'reproducibility/**'
      - 'examples/querygym_pyserini/pipeline.py'
      - 'dataset_registry.yaml'

concurrency:
  group: repro-${{ github.ref }}
  cancel-in-progress: true

jobs:
  check:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[repro,dev]"
          pip install pytest-cov

      - name: Run repro tests
        run: pytest reproducibility/tests -v --no-cov

      - name: Aggregator --check
        run: python -m reproducibility.scripts.aggregate_runs --check
5 changes: 5 additions & 0 deletions MANIFEST.in
@@ -14,6 +14,11 @@ recursive-include examples *.py *.tsv *.ipynb
# Include docs
recursive-include docs *.md

# Exclude reproducibility umbrella and (preemptive) future web/ from sdist.
# These are repo-only artifacts; nothing in them belongs in the PyPI source dist.
prune reproducibility
prune web

# Exclude development and build artifacts
global-exclude __pycache__
global-exclude *.py[co]
12 changes: 11 additions & 1 deletion Makefile
@@ -1,4 +1,4 @@
.PHONY: help build build-cpu build-all test clean
.PHONY: help build build-cpu build-all test clean repro-aggregate repro-check repro-test

# Makefile for QueryGym Docker Development
# This is for developers/contributors who need to build images locally
@@ -55,6 +55,16 @@ test:
@echo ""
@echo "✓ All tests passed!"

# Reproducibility data pipeline
repro-aggregate:
python -m reproducibility.scripts.aggregate_runs

repro-check:
python -m reproducibility.scripts.aggregate_runs --check

repro-test:
pytest reproducibility/tests -q --no-cov

# Clean up locally built images
clean:
@echo "Removing locally built images..."
82 changes: 82 additions & 0 deletions docs/user-guide/reproducibility.md
@@ -0,0 +1,82 @@
# Reproducibility & Leaderboard Submissions

QueryGym ships with a reproducibility pipeline that powers `leaderboard.querygym.com` and the SIGIR 2026 reproducibility paper. This page explains how to submit a result.

The full schema lives at `reproducibility/schema.md` (human-readable) and `reproducibility/schema.json` (machine-readable). All submitted JSONs are validated against it three times: at emit time, at submit time, and at aggregate time in CI.
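If you want to sanity-check a run JSON yourself before submitting, the sketch below validates it with the `jsonschema` package (installed via the `repro` extra). The file paths are illustrative, and the in-repo `reproducibility.lib` helpers perform an equivalent check; this is just a minimal local version.

```python
# Minimal local validation sketch. Paths are examples, not canonical locations.
import json
from jsonschema import Draft202012Validator  # schema.json is a Draft 2020-12 document

with open("reproducibility/schema.json") as f:
    schema = json.load(f)

with open("outputs/dl19_query2e_zs/pipeline_summary.json") as f:  # your run JSON
    run = json.load(f)

validator = Draft202012Validator(schema)
errors = list(validator.iter_errors(run))
for err in errors:
    location = "/".join(str(p) for p in err.path) or "<root>"
    print(f"{location}: {err.message}")
if not errors:
    print("run JSON conforms to schema v1")
```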

## Trusted contributor flow

If you have commit access:

```bash
# 1. Run the example pipeline.
python examples/querygym_pyserini/pipeline.py \
--dataset msmarco-v1-passage.trecdl2019 \
--method query2e \
--model gpt-4.1-mini \
--output-dir outputs/dl19_query2e_zs

# 2. Copy the output into the canonical layout.
python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2e_zs

# 3. Regenerate the aggregate CSV + manifest.
make repro-aggregate

# 4. Commit and open a PR.
git add reproducibility/data/
git commit -m "add query2e/gpt-4.1-mini result on dl19-passage"
git push
gh pr create
```

CI runs the schema/validator tests and `aggregate_runs.py --check`. If everything is green, the leaderboard rebuilds on merge.

### Common failure modes

| Symptom | Cause | Fix |
|---|---|---|
| `aggregator --check failed: results.csv is out of date` | You forgot step 3. | Run `make repro-aggregate`, commit the diff. |
| `dataset_id 'foo' not in dataset_registry.yaml` | Typo or new dataset not registered. | Add the dataset to `dataset_registry.yaml` first, then re-submit. |
| `method_id 'foo' not in registered methods` | Method not registered, or the name is misspelled. | Register via `@register_method("foo")` in `querygym/methods/`. |
| `params_hash mismatch` | The JSON was hand-edited. | Don't hand-edit run JSONs — re-run the emitter or use `submit_run` instead (see the sketch after this table). |
| `metric(s) ['bleu'] not in eval_metrics for dataset 'X'` | Unsupported metric for that dataset. | Either drop the metric or add it to the dataset's `output.eval_metrics` in the registry. |
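On the `params_hash` point: the authoritative hashing lives in `reproducibility/lib`, but conceptually a params hash is a digest over a canonical JSON encoding of the run's parameters, which is why any hand edit to the JSON invalidates it. A purely illustrative sketch (the function name, field names, and canonicalization here are assumptions, not the real scheme):

```python
# Hypothetical illustration only; the real scheme is defined in reproducibility/lib.
# The idea: hash a canonical (sorted-key, whitespace-free) JSON encoding of the
# parameters, so any manual edit to the stored JSON no longer matches the hash.
import hashlib
import json

def example_params_hash(method_params: dict, llm_config: dict) -> str:
    canonical = json.dumps(
        {"method_params": method_params, "llm_config": llm_config},
        sort_keys=True,
        separators=(",", ":"),
    )
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
```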

## External (fork) contributor flow

If you don't have commit access:

1. Fork `ls3-lab/QueryGym` on GitHub and clone your fork.
2. Run steps 1–3 from the trusted flow above.
3. Push to your fork and open a PR against `ls3-lab/QueryGym:main`.

CI runs the same schema/validator/aggregator checks against your PR — no LLM keys or Pyserini are needed for these checks, so fork PRs get fast feedback.

A maintainer will additionally **re-verify your numbers locally** before merging:

- **Cheap pre-check (~30s):** the maintainer runs `pytrec_eval` against your submitted `run.txt` using the dataset's qrels and confirms the reported metrics match.
- **Full re-run (only if needed):** if the cheap check is suspicious, the maintainer runs the example pipeline with your `config` block as inputs and compares reformulated queries + run file.

This is why every submission must include `run.txt` and `reformulated_queries.tsv` alongside the JSON — they make verification cheap.
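The sketch below shows roughly what that cheap pre-check looks like with `pytrec_eval`; the qrels and run paths are placeholders, not canonical locations in the repo.

```python
# Sketch of the maintainer's cheap pre-check with pytrec_eval (paths are placeholders).
import pytrec_eval

with open("qrels/dl19-passage.txt") as f:   # public qrels for the dataset
    qrels = pytrec_eval.parse_qrel(f)
with open("submission/run.txt") as f:       # the run file included in the PR
    run = pytrec_eval.parse_run(f)

evaluator = pytrec_eval.RelevanceEvaluator(qrels, {"ndcg_cut"})
per_query = evaluator.evaluate(run)

# Mean nDCG@10 over judged queries; compare against metrics.ndcg_cut_10 in the JSON.
scores = [q["ndcg_cut_10"] for q in per_query.values()]
print(f"ndcg_cut_10 = {sum(scores) / len(scores):.4f}")
```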

## Verifying a published number (paper readers)

Each leaderboard row links to the canonical files at a paper-release tag. To verify independently:

```bash
git clone --depth=1 --branch=paper-sigir2026 https://github.com/ls3-lab/QueryGym.git
cd QueryGym

# Pick a run.
RUN_DIR=reproducibility/data/runs/msmarco-v1-passage.trecdl2019/query2e/gpt-4.1-mini

# Re-run trec_eval against the public qrels (Pyserini ships them).
python -m pyserini.eval.trec_eval -m ndcg_cut.10 dl19-passage "${RUN_DIR}"/*.run.txt
```

The number from `pyserini.eval.trec_eval` should match `metrics.ndcg_cut_10` in the corresponding JSON.
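To pull the stored value out of the JSON for comparison, something like the following works; the assumption that the run directory contains a single summary `*.json` is ours, not prescribed by the repo layout.

```python
# Read the stored metric from the run's summary JSON (filename pattern assumed).
import glob
import json

run_dir = "reproducibility/data/runs/msmarco-v1-passage.trecdl2019/query2e/gpt-4.1-mini"
summary_path = glob.glob(f"{run_dir}/*.json")[0]
with open(summary_path) as f:
    summary = json.load(f)

print(summary["metrics"]["ndcg_cut_10"])  # should equal the pyserini trec_eval output
```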

## External tools (dashboard, third parties)

The contract is `reproducibility/schema.json` — a Draft 2020-12 JSON Schema document. Any tool that emits a conformant JSON can submit (subject to the trusted vs. fork flows above). You don't need to import any Python from QueryGym; just read the schema file and validate locally with whatever JSON Schema library your stack provides (`Ajv` for JS, `jsonschema` for Python, `everit-org/json-schema` for Java).

`schema_version` is `"const": 1` today. Bumping it to 2 will be a breaking change announced ahead of time.
160 changes: 145 additions & 15 deletions examples/querygym_pyserini/pipeline.py
@@ -38,6 +38,102 @@
from examples.querygym_pyserini import reformulate_queries
from examples.querygym_pyserini import retrieve
from examples.querygym_pyserini import evaluate
from reproducibility.lib import build_run_summary, validate, ValidationError


def _load_dataset_config_from_registry(dataset_id: str, registry_path: str) -> dict | None:
    """Pull dataset_config fields from dataset_registry.yaml. None if not registered."""
    try:
        import yaml
        with open(registry_path, 'r') as f:
            registry = yaml.safe_load(f) or {}
    except Exception:
        return None
    entry = registry.get('datasets', {}).get(dataset_id)
    if not entry:
        return None
    return {
        'topics': entry.get('topics', {}).get('name', ''),
        'index': entry.get('index', {}).get('name', ''),
        'num_queries': 0,  # filled by reformulation metadata when available
        'bm25_weights': entry.get('bm25_weights', {'k1': 0.0, 'b': 0.0}),
    }


def _build_v1_summary(
    *, results, dataset_name, method, model, method_params, llm_config,
    steps, pipeline_time, registry_path, queries_file, index_name,
) -> dict:
    """Pull fields from per-step metadata and call reproducibility.lib.build_run_summary."""
    reform = results.get('reformulation', {})
    retrieval = results.get('retrieval', {})
    evaluation = results.get('evaluation', {})

    reform_inner = reform.get('reformulation', {}) if isinstance(reform, dict) else {}
    dataset_inner = reform.get('dataset', {}) if isinstance(reform, dict) else {}

    # Resolve dataset_config: prefer reformulation metadata (richest), fall back
    # to the registry, then to file-based info.
    if dataset_inner.get('topics') or dataset_inner.get('index'):
        dataset_config = {
            'topics': dataset_inner.get('topics') or '',
            'index': dataset_inner.get('index') or (index_name or ''),
            'num_queries': int(dataset_inner.get('num_queries') or 0),
            'bm25_weights': dataset_inner.get('bm25_weights') or {'k1': 0.0, 'b': 0.0},
        }
    else:
        dataset_config = _load_dataset_config_from_registry(dataset_name, registry_path) or {
            'topics': '',
            'index': index_name or '',
            'num_queries': 0,
            'bm25_weights': {'k1': 0.0, 'b': 0.0},
        }
        # If reformulation produced num_queries but no other config, splice it in.
        if dataset_inner.get('num_queries'):
            dataset_config['num_queries'] = int(dataset_inner['num_queries'])

    # Searcher: use reformulation's searcher info if present, otherwise unknown.
    searcher_info = reform_inner.get('searcher') or {}
    searcher = {
        'name': searcher_info.get('name') or 'unknown',
        'type': searcher_info.get('type') or 'unknown',
    }

    # Effective method_params / llm_config: prefer the per-step metadata's view
    # (already resolved with method-specific defaults), fall back to caller args.
    eff_method_params = reform_inner.get('method_params') or dict(method_params or {})
    # method_params may include a non-serializable searcher; strip it.
    eff_method_params = {
        k: v for k, v in eff_method_params.items()
        if (not callable(v) and not hasattr(v, '__dict__'))
        or isinstance(v, (dict, list, str, int, float, bool, type(None)))
    }
    eff_llm_config = reform_inner.get('llm_config') or dict(llm_config or {})
    # Keep only schema-relevant keys; build_run_summary accepts extras but the schema
    # only requires temperature + max_tokens.
    if 'temperature' not in eff_llm_config:
        eff_llm_config['temperature'] = (llm_config or {}).get('temperature', 0.0)
    if 'max_tokens' not in eff_llm_config:
        eff_llm_config['max_tokens'] = (llm_config or {}).get('max_tokens', 1)

    timing = {
        'reformulation_seconds': float(reform.get('timing', {}).get('total_time_seconds', 0.0)),
        'retrieval_seconds': float(retrieval.get('timing', {}).get('total_time_seconds', 0.0)),
        'evaluation_seconds': float(evaluation.get('timing', {}).get('eval_time_seconds', 0.0)),
    }

    return build_run_summary(
        dataset_id=(dataset_name or queries_file.stem) if queries_file else (dataset_name or 'unknown'),
        method_id=method,
        model=model,
        method_params=eff_method_params,
        llm_config=eff_llm_config,
        searcher=searcher,
        dataset_config=dataset_config,
        metrics=evaluation.get('results', {}),
        timing=timing,
        steps_completed=steps,
        total_time_seconds=pipeline_time,
    )


def run_pipeline(
@@ -189,22 +285,56 @@ def run_pipeline(

    # Pipeline complete
    pipeline_time = time.time() - pipeline_start

    # Save pipeline summary
    summary = {
        'pipeline': {
            'dataset': dataset_name,
            'method': method,
            'model': model,
            'steps_completed': steps,
            'total_time_seconds': pipeline_time,
            'formatted_time': format_time(pipeline_time)
        },
        'results': results
    }

    # Build the canonical v1 run summary if the full pipeline ran with metrics.
    # Partial runs (no evaluate step / no metrics) fall back to a debug summary
    # that is NOT leaderboard-eligible.
    eval_block = results.get('evaluation', {})
    metrics = eval_block.get('results') if isinstance(eval_block, dict) else None

    summary_file = output_dir / 'pipeline_summary.json'
    save_config(summary, summary_file)

    if metrics:
        payload = _build_v1_summary(
            results=results,
            dataset_name=dataset_name,
            method=method,
            model=model,
            method_params=method_params,
            llm_config=llm_config,
            steps=steps,
            pipeline_time=pipeline_time,
            registry_path=registry_path,
            queries_file=queries_file,
            index_name=index_name,
        )
        try:
            validate(payload)
        except ValidationError as e:
            logging.error(f"v1 schema validation failed: {e}")
            raise
        with open(summary_file, 'w') as f:
            json.dump(payload, f, indent=2, sort_keys=False)
            f.write('\n')
        logging.info(f"v1 run summary written: {summary_file}")
    else:
        # Partial pipeline → legacy debug shape, different filename so submit_run
        # doesn't accidentally pick it up.
        partial_file = output_dir / 'pipeline_partial.json'
        partial = {
            'pipeline': {
                'dataset': dataset_name,
                'method': method,
                'model': model,
                'steps_completed': steps,
                'total_time_seconds': pipeline_time,
                'formatted_time': format_time(pipeline_time),
            },
            'results': results,
            'note': 'partial pipeline; v1 summary not emitted (no metrics)',
        }
        save_config(partial, partial_file)
        logging.info(f"partial summary written: {partial_file}")

    # Create human-readable summary
    summary_txt = output_dir / 'pipeline_summary.txt'
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -82,6 +82,7 @@ nav:
    - Docker Guide: user-guide/docker.md
    - Searcher Interface: user-guide/searcher.md
    - Prompt Bank: user-guide/prompts.md
    - Reproducibility: user-guide/reproducibility.md
  - API Reference:
    - Core: api/core.md
    - Methods: api/methods.md
10 changes: 9 additions & 1 deletion pyproject.toml
@@ -59,11 +59,19 @@ dev = [
"mkdocstrings[python]>=0.23.0",
]

# Reproducibility tooling extras (aggregator, validator, leaderboard build helpers)
repro = [
"pandas>=2.0.0",
"jsonschema>=4.20.0",
]

# All extras combined
all = [
"datasets>=2.20.0",
"beir>=2.0.0",
"pyserini>=0.22.0",
"pandas>=2.0.0",
"jsonschema>=4.20.0",
]

[project.scripts]
@@ -97,7 +105,7 @@ ignore = ["E501"] # Line too long (handled by black)

# Pytest configuration
[tool.pytest.ini_options]
testpaths = ["tests"]
testpaths = ["tests", "reproducibility/tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]