diff --git a/.github/workflows/reproducibility-check.yml b/.github/workflows/reproducibility-check.yml
new file mode 100644
index 0000000..896d8c9
--- /dev/null
+++ b/.github/workflows/reproducibility-check.yml
@@ -0,0 +1,42 @@
+name: Reproducibility Check
+
+on:  # both triggers are limited to the files that feed the reproducibility checks
+  pull_request:
+    paths:
+      - 'reproducibility/**'
+      - 'examples/querygym_pyserini/pipeline.py'
+      - 'dataset_registry.yaml'
+  push:
+    branches: [main]
+    paths:  # same path filter as pull_request
+      - 'reproducibility/**'
+      - 'examples/querygym_pyserini/pipeline.py'
+      - 'dataset_registry.yaml'
+
+concurrency:  # one active run per ref; a newer run cancels the one still in progress
+  group: repro-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.9'
+
+      - name: Install dependencies  # pytest-cov provides the --no-cov flag used by the test step
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[repro,dev]"
+          pip install pytest-cov
+
+      - name: Run repro tests
+        run: pytest reproducibility/tests -v --no-cov
+
+      - name: Aggregator --check  # verifies the committed results.csv is in sync with data/runs/
+        run: python -m reproducibility.scripts.aggregate_runs --check
diff --git a/MANIFEST.in b/MANIFEST.in
index 42f3f29..539e427 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -14,6 +14,11 @@ recursive-include examples *.py *.tsv *.ipynb
 # Include docs
 recursive-include docs *.md
 
+# Exclude reproducibility umbrella and (preemptive) future web/ from sdist.
+# These are repo-only artifacts; nothing in them belongs in the PyPI source dist.
+prune reproducibility
+prune web
+
 # Exclude development and build artifacts
 global-exclude __pycache__
 global-exclude *.py[co]
diff --git a/Makefile b/Makefile
index 8b4b20c..db89795 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help build build-cpu build-all test clean
+.PHONY: help build build-cpu build-all test clean repro-aggregate repro-check repro-test
 
 # Makefile for QueryGym Docker Development
 # This is for developers/contributors who need to build images locally
@@ -55,6 +55,16 @@ test:
 	@echo ""
 	@echo "✓ All tests passed!"
 
+# Reproducibility data pipeline (see reproducibility/README.md)
+repro-aggregate:  # regenerate reproducibility/data/results.csv and manifest.json
+	python -m reproducibility.scripts.aggregate_runs
+
+repro-check:  # same aggregator in --check mode, as run by CI
+	python -m reproducibility.scripts.aggregate_runs --check
+
+repro-test:  # run the reproducibility test suite with coverage disabled
+	pytest reproducibility/tests -q --no-cov
+
 # Clean up locally built images
 clean:
 	@echo "Removing locally built images..."
diff --git a/docs/user-guide/reproducibility.md b/docs/user-guide/reproducibility.md
new file mode 100644
index 0000000..c825d22
--- /dev/null
+++ b/docs/user-guide/reproducibility.md
@@ -0,0 +1,82 @@
+# Reproducibility & Leaderboard Submissions
+
+QueryGym ships with a reproducibility pipeline that powers `leaderboard.querygym.com` and the SIGIR 2026 reproducibility paper. This page explains how to submit a result.
+
+The full schema lives at `reproducibility/schema.md` (human-readable) and `reproducibility/schema.json` (machine-readable). All submitted JSONs are validated against it three times: at emit time, at submit time, and at aggregate time in CI.
+
+## Trusted contributor flow
+
+If you have commit access:
+
+```bash
+# 1. Run the example pipeline.
+python examples/querygym_pyserini/pipeline.py \
+    --dataset msmarco-v1-passage.trecdl2019 \
+    --method query2e \
+    --model gpt-4.1-mini \
+    --output-dir outputs/dl19_query2e_zs
+
+# 2. Copy the output into the canonical layout.
+python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2e_zs
+
+# 3. Regenerate the aggregate CSV + manifest.
+make repro-aggregate
+
+# 4. Commit and open a PR.
+git add reproducibility/data/
+git commit -m "add query2e/gpt-4.1-mini result on dl19-passage"
+git push
+gh pr create
+```
+
+CI runs the schema/validator tests and `aggregate_runs.py --check`. If everything is green, the leaderboard rebuilds on merge.
+
+### Common failure modes
+
+| Symptom | Cause | Fix |
+|---|---|---|
+| `aggregator --check failed: results.csv is out of date` | You forgot step 3. | Run `make repro-aggregate`, commit the diff. |
+| `dataset_id 'foo' not in dataset_registry.yaml` | Typo or new dataset not registered. | Add the dataset to `dataset_registry.yaml` first, then re-submit. |
+| `method_id 'foo' not in registered methods` | Method not registered or name typo'd. | Register via `@register_method("foo")` in `querygym/methods/`. |
+| `params_hash mismatch` | The JSON was hand-edited. | Don't hand-edit run JSONs — re-run the emitter or use `submit_run` instead. |
+| `metric(s) ['bleu'] not in eval_metrics for dataset 'X'` | Unsupported metric for that dataset. | Either drop the metric or add it to the dataset's `output.eval_metrics` in the registry. |
+
+## External (fork) contributor flow
+
+If you don't have commit access:
+
+1. Fork `ls3-lab/QueryGym` on GitHub and clone your fork.
+2. Run steps 1–3 from the trusted flow above.
+3. Push to your fork and open a PR against `ls3-lab/QueryGym:main`.
+
+CI runs the same schema/validator/aggregator checks against your PR — no LLM keys or Pyserini are needed for these checks, so fork PRs get fast feedback.
+
+A maintainer will additionally **re-verify your numbers locally** before merging:
+
+- **Cheap pre-check (~30s):** the maintainer runs `pytrec_eval` against your submitted `run.txt` using the dataset's qrels and confirms the reported metrics match.
+- **Full re-run (only if needed):** if the cheap check is suspicious, the maintainer runs the example pipeline with your `config` block as inputs and compares reformulated queries + run file.
+
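+This is why every submission must include the run file and the reformulated queries alongside the JSON (`run.txt` and `reformulated_queries.tsv` in the pipeline output; `{params_hash}.run.txt` and `{params_hash}.queries.tsv` once `submit_run` copies them into `reproducibility/data/runs/`) — they make verification cheap.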
+
+## Verifying a published number (paper readers)
+
+Each leaderboard row links to the canonical files at a paper-release tag. To verify independently:
+
+```bash
+git clone --depth=1 --branch=paper-sigir2026 https://github.com/ls3-lab/QueryGym.git
+cd QueryGym
+
+# Pick a run.
+RUN_DIR=reproducibility/data/runs/msmarco-v1-passage.trecdl2019/query2e/gpt-4.1-mini
+
+# Re-run trec_eval against the public qrels (Pyserini ships them).
+python -m pyserini.eval.trec_eval -m ndcg_cut.10 dl19-passage "${RUN_DIR}"/*.run.txt
+```
+
+The number from `pyserini.eval.trec_eval` should match `metrics.ndcg_cut_10` in the corresponding JSON.
+
+## External tools (dashboard, third parties)
+
+The contract is `reproducibility/schema.json` — a Draft 2020-12 JSON Schema document. Any tool that emits a conformant JSON can submit (subject to the trusted vs. fork flows above). You don't need to import any Python from QueryGym; just read the schema file and validate locally with whatever JSON Schema library your stack provides (`Ajv` for JS, `jsonschema` for Python, `everit-org/json-schema` for Java).
+
+`schema_version` is `"const": 1` today. Bumping it to 2 will be a breaking change announced ahead of time.
diff --git a/examples/querygym_pyserini/pipeline.py b/examples/querygym_pyserini/pipeline.py
index 25660dd..3b66f4c 100755
--- a/examples/querygym_pyserini/pipeline.py
+++ b/examples/querygym_pyserini/pipeline.py
@@ -38,6 +38,102 @@
 from examples.querygym_pyserini import reformulate_queries
 from examples.querygym_pyserini import retrieve
 from examples.querygym_pyserini import evaluate
+from reproducibility.lib import build_run_summary, validate, ValidationError
+
+
+def _load_dataset_config_from_registry(dataset_id: str, registry_path: str) -> "dict | None":  # quoted: PEP 604 unions fail at def time on Python 3.9
+    """Best-effort lookup of ``dataset_id`` under ``datasets`` in the YAML registry; returns dataset_config fields, or None if unavailable."""
+    try:
+        import yaml  # imported lazily; any failure in this block is treated as "not registered"
+        with open(registry_path, 'r', encoding='utf-8') as f:
+            registry = yaml.safe_load(f) or {}
+    except Exception:  # deliberate best-effort: the caller falls back to defaults on None
+        return None
+    entry = (registry.get('datasets') or {}).get(dataset_id) if isinstance(registry, dict) else None
+    if not isinstance(entry, dict) or not entry:
+        return None
+    return {
+        'topics': (entry.get('topics') or {}).get('name') or '',  # `or`: a null YAML value must not leak through as None
+        'index': (entry.get('index') or {}).get('name') or '',
+        'num_queries': 0,  # filled by reformulation metadata when available
+        'bm25_weights': entry.get('bm25_weights') or {'k1': 0.0, 'b': 0.0},
+    }
+
+
+def _build_v1_summary(
+    *, results, dataset_name, method, model, method_params, llm_config,
+    steps, pipeline_time, registry_path, queries_file, index_name,
+) -> dict:
+    """Assemble build_run_summary() arguments from the per-step metadata in ``results`` (without mutating it) and return its result."""
+    reform = results.get('reformulation') if isinstance(results.get('reformulation'), dict) else {}
+    retrieval = results.get('retrieval') if isinstance(results.get('retrieval'), dict) else {}
+    evaluation = results.get('evaluation') if isinstance(results.get('evaluation'), dict) else {}
+
+    reform_inner = reform.get('reformulation') if isinstance(reform.get('reformulation'), dict) else {}
+    dataset_inner = reform.get('dataset') if isinstance(reform.get('dataset'), dict) else {}
+
+    # Resolve dataset_config: prefer reformulation metadata (richest), fall back
+    # to the registry, then to file-based info.
+    if dataset_inner.get('topics') or dataset_inner.get('index'):
+        dataset_config = {
+            'topics': dataset_inner.get('topics') or '',
+            'index': dataset_inner.get('index') or (index_name or ''),
+            'num_queries': int(dataset_inner.get('num_queries') or 0),
+            'bm25_weights': dataset_inner.get('bm25_weights') or {'k1': 0.0, 'b': 0.0},
+        }
+    else:
+        dataset_config = _load_dataset_config_from_registry(dataset_name, registry_path) or {
+            'topics': '',
+            'index': index_name or '',
+            'num_queries': 0,
+            'bm25_weights': {'k1': 0.0, 'b': 0.0},
+        }
+        # If reformulation produced num_queries but no other config, splice it in.
+        if dataset_inner.get('num_queries'):
+            dataset_config['num_queries'] = int(dataset_inner['num_queries'])
+
+    # Searcher: use reformulation's searcher info if present, otherwise unknown.
+    searcher_info = reform_inner.get('searcher') or {}
+    searcher = {
+        'name': searcher_info.get('name') or 'unknown',
+        'type': searcher_info.get('type') or 'unknown',
+    }
+
+    # Effective method_params / llm_config: prefer the per-step metadata's view
+    # (already resolved with method-specific defaults), fall back to caller args.
+    eff_method_params = reform_inner.get('method_params') or dict(method_params or {})
+    # method_params may include a non-serializable searcher; strip it.
+    eff_method_params = {
+        k: v for k, v in eff_method_params.items()
+        if (not callable(v) and not hasattr(v, '__dict__')) or isinstance(v, (dict, list, str, int, float, bool, type(None)))
+    }
+    eff_llm_config = dict(reform_inner.get('llm_config') or llm_config or {})  # copy: never mutate results or the caller's dict
+    # The schema requires temperature + max_tokens (extra keys are allowed), so backfill
+    # them from the caller's llm_config, or from schema-valid defaults, when missing.
+    if 'temperature' not in eff_llm_config:
+        eff_llm_config['temperature'] = (llm_config or {}).get('temperature', 0.0)
+    if 'max_tokens' not in eff_llm_config:
+        eff_llm_config['max_tokens'] = (llm_config or {}).get('max_tokens', 1)
+
+    timing = {
+        'reformulation_seconds': float((reform.get('timing') or {}).get('total_time_seconds') or 0.0),
+        'retrieval_seconds': float((retrieval.get('timing') or {}).get('total_time_seconds') or 0.0),
+        'evaluation_seconds': float((evaluation.get('timing') or {}).get('eval_time_seconds') or 0.0),
+    }
+
+    return build_run_summary(
+        dataset_id=dataset_name or (queries_file.stem if queries_file else 'unknown'),
+        method_id=method,
+        model=model,
+        method_params=eff_method_params,
+        llm_config=eff_llm_config,
+        searcher=searcher,
+        dataset_config=dataset_config,
+        metrics=evaluation.get('results', {}),
+        timing=timing,
+        steps_completed=steps,
+        total_time_seconds=pipeline_time,
+    )
 
 
 def run_pipeline(
@@ -189,22 +285,56 @@ def run_pipeline(
     
     # Pipeline complete
     pipeline_time = time.time() - pipeline_start
-    
-    # Save pipeline summary
-    summary = {
-        'pipeline': {
-            'dataset': dataset_name,
-            'method': method,
-            'model': model,
-            'steps_completed': steps,
-            'total_time_seconds': pipeline_time,
-            'formatted_time': format_time(pipeline_time)
-        },
-        'results': results
-    }
-    
+
+    # Build the canonical v1 run summary if the full pipeline ran with metrics.
+    # Partial runs (no evaluate step / no metrics) fall back to a debug summary
+    # that is NOT leaderboard-eligible.
+    eval_block = results.get('evaluation', {})
+    metrics = eval_block.get('results') if isinstance(eval_block, dict) else None
+
     summary_file = output_dir / 'pipeline_summary.json'
-    save_config(summary, summary_file)
+
+    if metrics:
+        payload = _build_v1_summary(
+            results=results,
+            dataset_name=dataset_name,
+            method=method,
+            model=model,
+            method_params=method_params,
+            llm_config=llm_config,
+            steps=steps,
+            pipeline_time=pipeline_time,
+            registry_path=registry_path,
+            queries_file=queries_file,
+            index_name=index_name,
+        )
+        try:
+            validate(payload)
+        except ValidationError as e:
+            logging.error(f"v1 schema validation failed: {e}")
+            raise
+        with open(summary_file, 'w') as f:
+            json.dump(payload, f, indent=2, sort_keys=False)
+            f.write('\n')
+        logging.info(f"v1 run summary written: {summary_file}")
+    else:
+        # Partial pipeline → legacy debug shape, different filename so submit_run
+        # doesn't accidentally pick it up.
+        partial_file = output_dir / 'pipeline_partial.json'
+        partial = {
+            'pipeline': {
+                'dataset': dataset_name,
+                'method': method,
+                'model': model,
+                'steps_completed': steps,
+                'total_time_seconds': pipeline_time,
+                'formatted_time': format_time(pipeline_time),
+            },
+            'results': results,
+            'note': 'partial pipeline; v1 summary not emitted (no metrics)',
+        }
+        save_config(partial, partial_file)
+        logging.info(f"partial summary written: {partial_file}")
     
     # Create human-readable summary
     summary_txt = output_dir / 'pipeline_summary.txt'
diff --git a/mkdocs.yml b/mkdocs.yml
index f3132ef..a3eea54 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -82,6 +82,7 @@ nav:
       - Docker Guide: user-guide/docker.md
       - Searcher Interface: user-guide/searcher.md
       - Prompt Bank: user-guide/prompts.md
+      - Reproducibility: user-guide/reproducibility.md
   - API Reference:
       - Core: api/core.md
       - Methods: api/methods.md
diff --git a/pyproject.toml b/pyproject.toml
index 9a8b356..6553242 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,11 +59,19 @@ dev = [
   "mkdocstrings[python]>=0.23.0",
 ]
 
+# Reproducibility tooling extras (aggregator, validator, leaderboard build helpers)
+repro = [
+  "pandas>=2.0.0",
+  "jsonschema>=4.20.0",
+]
+
 # All extras combined
 all = [
   "datasets>=2.20.0",
   "beir>=2.0.0",
   "pyserini>=0.22.0",
+  "pandas>=2.0.0",
+  "jsonschema>=4.20.0",
 ]
 
 [project.scripts]
@@ -97,7 +105,7 @@ ignore = ["E501"]  # Line too long (handled by black)
 
 # Pytest configuration
 [tool.pytest.ini_options]
-testpaths = ["tests"]
+testpaths = ["tests", "reproducibility/tests"]
 python_files = ["test_*.py"]
 python_classes = ["Test*"]
 python_functions = ["test_*"]
diff --git a/reproducibility/README.md b/reproducibility/README.md
new file mode 100644
index 0000000..ce163e2
--- /dev/null
+++ b/reproducibility/README.md
@@ -0,0 +1,73 @@
+# reproducibility/
+
+Everything that backs the QueryGym leaderboard and the SIGIR 2026 reproducibility paper lives here. The goal is one self-contained directory holding the schema, the data, the tooling, the tests, and (later) the leaderboard site.
+
+## What's in here
+
+| Path | What it is |
+|---|---|
+| `schema.json` | Canonical JSON Schema (Draft 2020-12) — the contract every run JSON must match. Read this from any language; it's not Python-specific. |
+| `schema.md` | Human-readable mirror of `schema.json` with field descriptions and a worked example. |
+| `lib/` | Private Python helpers used by this repo's tooling (the example pipeline, the aggregator, `submit_run`, the tests). Not a public API; external consumers should read `schema.json`. |
+| `data/runs/` | Per-run JSONs (canonical, citable). One JSON + sibling `.run.txt` + `.queries.tsv` per `(dataset, method, model, params)` cell. |
+| `data/results.csv` | Long-format aggregate, one row per `(run, metric)`. Derived; regenerated by `aggregate_runs.py`. |
+| `data/manifest.json` | Run count, row count, content hash, schema version. Derived. |
+| `scripts/aggregate_runs.py` | Walks `data/runs/`, validates each JSON, emits `results.csv` + `manifest.json`. Has a `--check` mode for CI. |
+| `scripts/submit_run.py` | Validates a fresh pipeline output dir and copies its files into the canonical `data/runs/` layout. |
+| `tests/` | Tests for `lib/`. Run via `pytest reproducibility/tests` or `make repro-test`. |
+| `site/` *(future)* | The leaderboard Astro app deployed to `leaderboard.querygym.com`. Not in this PR. |
+
+## Layout
+
+Runs are organized by **dataset → method → LLM**:
+
+```
+data/runs/{dataset_id}/{method_id}/{model}/{params_hash}.{json,run.txt,queries.tsv}
+```
+
+- `dataset_id` matches a key in `dataset_registry.yaml` at the repo root.
+- `method_id` matches a `@register_method(...)` name in `querygym/methods/`.
+- `model` is the LLM identifier (e.g. `gpt-4.1-mini`).
+- `params_hash` is the first 8 hex characters of a SHA-256 digest over `(method_id, model, method_params, llm_config)`. Two runs with the same config share a hash and overwrite each other intentionally; runs with different temperature get different hashes.
+
+## Submitting a run
+
+1. Run the example pipeline:
+   ```bash
+   python examples/querygym_pyserini/pipeline.py \
+       --dataset msmarco-v1-passage.trecdl2019 \
+       --method query2e \
+       --model gpt-4.1-mini \
+       --output-dir outputs/dl19_query2e
+   ```
+   The pipeline writes `pipeline_summary.json` (v1 schema) plus `runs/run.txt` and `queries/reformulated_queries.tsv` into the output dir.
+
+2. Copy into the canonical layout:
+   ```bash
+   python -m reproducibility.scripts.submit_run --from-dir outputs/dl19_query2e
+   ```
+
+3. Regenerate `results.csv` and `manifest.json`:
+   ```bash
+   make repro-aggregate
+   ```
+
+4. Commit both the new `data/runs/...` files and the updated `data/results.csv` + `data/manifest.json`.
+
+5. Open a PR. CI will:
+   - Run the schema/validator tests.
+   - Run `aggregate_runs.py --check` to confirm `results.csv` is in sync with `runs/`.
+
+For external (fork) submissions, a maintainer will additionally verify by re-running locally before merging — see `docs/user-guide/reproducibility.md` for details.
+
+## For external consumers (dashboard, third parties)
+
+The contract is `schema.json`. Read it from any tool that supports JSON Schema (Python `jsonschema`, JS `Ajv`, Java `everit-org/json-schema`, etc.) and emit conformant payloads from your own code. You do not need to import any Python from this repo.
+
+Bumping `schema_version` to a new integer is a breaking change; the field is `"const": 1` today and will be replaced (not extended) when v2 lands.
+
+## Why a separate top-level directory?
+
+The `querygym/` package on PyPI is the toolkit. This umbrella is the **reproducibility artifact** — schema, data, tooling, eventually the site — and it shouldn't bloat the wheel. `MANIFEST.in` prunes it from the sdist; `pyproject.toml`'s `tool.setuptools.packages.find` only includes `querygym*`.
+
+The example pipeline in `examples/querygym_pyserini/` imports `reproducibility.lib` because it runs from a clone of this repo; `pip install querygym` users don't need it.
diff --git a/reproducibility/__init__.py b/reproducibility/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/reproducibility/data/manifest.json b/reproducibility/data/manifest.json
new file mode 100644
index 0000000..19f9959
--- /dev/null
+++ b/reproducibility/data/manifest.json
@@ -0,0 +1,8 @@
+{
+  "content_hash": "8f32fb55fdd189d27cfaa021fc81e55652fe80e151083c24fde1cce7ef99341f",
+  "generated_at": "2026-04-29T19:52:22Z",
+  "querygym_version": "unknown",
+  "row_count": 0,
+  "run_count": 0,
+  "schema_version": 1
+}
diff --git a/reproducibility/data/results.csv b/reproducibility/data/results.csv
new file mode 100644
index 0000000..1a89e7a
--- /dev/null
+++ b/reproducibility/data/results.csv
@@ -0,0 +1 @@
+schema_version,run_id,dataset_id,method_id,model,params_hash,method_params_json,llm_temperature,llm_max_tokens,metric,value,num_queries,total_time_seconds,querygym_version,run_file_path
diff --git a/reproducibility/data/runs/.gitkeep b/reproducibility/data/runs/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/reproducibility/schema.json b/reproducibility/schema.json
new file mode 100644
index 0000000..a272997
--- /dev/null
+++ b/reproducibility/schema.json
@@ -0,0 +1,159 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://querygym.com/schemas/run-summary/v1.json",
+  "title": "QueryGym Run Summary v1",
+  "description": "Canonical per-run metrics document emitted by QueryGym pipelines and consumed by the leaderboard.",
+  "type": "object",
+  "additionalProperties": false,
+  "required": [
+    "schema_version",
+    "run_id",
+    "params_hash",
+    "submitted_at",
+    "querygym_version",
+    "environment",
+    "pipeline",
+    "config",
+    "metrics",
+    "timing",
+    "artifacts"
+  ],
+  "properties": {
+    "schema_version": {
+      "const": 1,
+      "description": "Schema version. Bumping is a breaking change."
+    },
+    "run_id": {
+      "type": "string",
+      "pattern": "^[0-9a-f]{16}$",
+      "description": "Hash over the full payload minus volatile fields. Identifies a specific execution."
+    },
+    "params_hash": {
+      "type": "string",
+      "pattern": "^[0-9a-f]{8}$",
+      "description": "Hash over the tuning surface (method_id, model, method_params, llm_config). Doubles as filename."
+    },
+    "submitted_at": {
+      "type": "string",
+      "format": "date-time",
+      "description": "ISO 8601 UTC timestamp of JSON generation. Excluded from run_id hash."
+    },
+    "querygym_version": {
+      "type": "string",
+      "description": "querygym package __version__ at emit time."
+    },
+    "environment": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["python_version", "platform"],
+      "properties": {
+        "python_version": {"type": "string"},
+        "platform": {"type": "string"},
+        "git_commit": {"type": ["string", "null"]}
+      }
+    },
+    "pipeline": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["dataset_id", "method_id", "model", "steps_completed", "total_time_seconds"],
+      "properties": {
+        "dataset_id": {
+          "type": "string",
+          "description": "Must match a key in dataset_registry.yaml; enforced at validate time."
+        },
+        "method_id": {
+          "type": "string",
+          "description": "Must match a name registered via @register_method; enforced at validate time."
+        },
+        "model": {
+          "type": "string",
+          "description": "LLM model identifier (e.g. gpt-4.1-mini, qwen2.5:7b)."
+        },
+        "steps_completed": {
+          "type": "array",
+          "items": {"enum": ["reformulate", "retrieve", "evaluate"]},
+          "uniqueItems": true,
+          "minItems": 1
+        },
+        "total_time_seconds": {"type": "number", "minimum": 0}
+      }
+    },
+    "config": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["method_params", "llm_config", "searcher", "dataset_config"],
+      "properties": {
+        "method_params": {
+          "type": "object",
+          "description": "Method-specific parameters. Free-form per method, included in params_hash."
+        },
+        "llm_config": {
+          "type": "object",
+          "additionalProperties": true,
+          "required": ["temperature", "max_tokens"],
+          "properties": {
+            "temperature": {"type": "number"},
+            "max_tokens": {"type": "integer", "minimum": 1},
+            "top_p": {"type": "number"}
+          }
+        },
+        "searcher": {
+          "type": "object",
+          "additionalProperties": false,
+          "required": ["name", "type"],
+          "properties": {
+            "name": {"type": "string"},
+            "type": {"type": "string"}
+          }
+        },
+        "dataset_config": {
+          "type": "object",
+          "additionalProperties": false,
+          "required": ["topics", "index", "num_queries", "bm25_weights"],
+          "properties": {
+            "topics": {"type": "string"},
+            "index": {"type": "string"},
+            "num_queries": {"type": "integer", "minimum": 0},
+            "bm25_weights": {
+              "type": "object",
+              "additionalProperties": false,
+              "required": ["k1", "b"],
+              "properties": {
+                "k1": {"type": "number"},
+                "b": {"type": "number"}
+              }
+            }
+          }
+        }
+      }
+    },
+    "metrics": {
+      "type": "object",
+      "minProperties": 1,
+      "additionalProperties": {"type": "number"},
+      "description": "Flat metric_name -> value. Keys must be a subset of the dataset's eval_metrics (with dots normalized to underscores); enforced at validate time."
+    },
+    "timing": {
+      "type": "object",
+      "additionalProperties": {"type": "number", "minimum": 0},
+      "description": "Per-step seconds. Common keys: reformulation_seconds, retrieval_seconds, evaluation_seconds."
+    },
+    "artifacts": {
+      "type": "object",
+      "additionalProperties": false,
+      "required": ["run_file", "reformulated_queries"],
+      "properties": {
+        "run_file": {
+          "type": "string",
+          "pattern": "^[0-9a-f]{8}\\.run\\.txt$",
+          "description": "Sibling TREC run file. Filename must equal {params_hash}.run.txt."
+        },
+        "reformulated_queries": {
+          "type": "string",
+          "pattern": "^[0-9a-f]{8}\\.queries\\.tsv$",
+          "description": "Sibling reformulated queries TSV. Filename must equal {params_hash}.queries.tsv."
+        }
+      }
+    }
+  }
+}
diff --git a/reproducibility/schema.md b/reproducibility/schema.md
new file mode 100644
index 0000000..04d48ee
--- /dev/null
+++ b/reproducibility/schema.md
@@ -0,0 +1,117 @@
+# Run Summary Schema (v1)
+
+This document mirrors `reproducibility/schema.json` in human-readable form. Both files are kept in sync via `reproducibility/tests/test_repro_schema.py`, which embeds the same canonical fixture used here.
+
+## Top-level fields
+
+| Field | Type | Required | Description |
+|---|---|---|---|
+| `schema_version` | `int` | yes | Always `1`. Bumping is a breaking change. |
+| `run_id` | `string` (16-char hex) | yes | SHA-256 prefix over the payload minus volatile fields. Identifies a specific execution. |
+| `params_hash` | `string` (8-char hex) | yes | SHA-256 prefix over `(method_id, model, method_params, llm_config)`. Doubles as the on-disk filename. |
+| `submitted_at` | ISO 8601 UTC | yes | Wall-clock time the JSON was generated. Excluded from `run_id`. |
+| `querygym_version` | `string` | yes | `querygym.__version__` at emit time. |
+| `environment` | object | yes | Python version, platform, optional git commit. |
+| `pipeline` | object | yes | dataset_id, method_id, model, steps_completed, total_time_seconds. |
+| `config` | object | yes | method_params, llm_config, searcher, dataset_config. |
+| `metrics` | object | yes | Flat `metric_name -> float`. Must have ≥1 entry. |
+| `timing` | object | yes | Per-step seconds. |
+| `artifacts` | object | yes | Sibling `run_file` and `reformulated_queries` filenames. |
+
+## Validation rules (beyond the static schema)
+
+These are enforced by `reproducibility.lib.validate(...)` at runtime:
+
+1. `pipeline.dataset_id` must be a key in `dataset_registry.yaml`.
+2. `pipeline.method_id` must be registered via `@register_method(...)` in `querygym/methods/`.
+3. Each `metrics` key must be in the dataset's `output.eval_metrics` (after normalizing dots to underscores: `ndcg_cut.10` → `ndcg_cut_10`).
+4. `params_hash` is recomputed from `(method_id, model, method_params, llm_config)` and must equal the stored value.
+5. `run_id` is recomputed from the payload (minus `run_id`, `submitted_at`, `environment`) and must equal the stored value.
+6. `artifacts.run_file` must equal `{params_hash}.run.txt`; `artifacts.reformulated_queries` must equal `{params_hash}.queries.tsv`.
+
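+Hand-editing a metric value without re-running the emitter will fail validation (rule 5). This catches accidental or careless edits; it is not tamper-proof, because the hash is unkeyed and anyone can recompute `run_id` — independent re-evaluation of the sibling `.run.txt` is what confirms the numbers.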
+
+## Hashing details
+
+```python
+def compute_params_hash(method_id, model, method_params, llm_config) -> str:
+    payload = {"method_id": ..., "model": ..., "method_params": ..., "llm_config": ...}
+    return sha256(json.dumps(payload, sort_keys=True, separators=(",",":"))).hexdigest()[:8]
+
+def compute_run_id(payload) -> str:
+    stripped = {k: v for k, v in payload.items() if k not in ("run_id", "submitted_at", "environment")}
+    return sha256(json.dumps(stripped, sort_keys=True, separators=(",",":"))).hexdigest()[:16]
+```
+
+`json.dumps(..., sort_keys=True)` makes hashes invariant to key ordering. Hashes change when any field they cover changes.
+
+## Canonical example
+
+This is `reproducibility/tests/fixtures/sample_run.json` — used by tests, embedded here, and produced from the inputs in `test_repro_schema._build_kwargs()`:
+
+```json
+{
+  "schema_version": 1,
+  "run_id": "cabe83ca1236a3bb",
+  "params_hash": "ddb15ccf",
+  "submitted_at": "2026-04-29T10:14:22Z",
+  "querygym_version": "0.3.0",
+  "environment": {
+    "python_version": "3.10.13",
+    "platform": "Linux-5.15.0-x86_64",
+    "git_commit": "5c46a51"
+  },
+  "pipeline": {
+    "dataset_id": "msmarco-v1-passage.trecdl2019",
+    "method_id": "query2e",
+    "model": "gpt-4.1-mini",
+    "steps_completed": ["reformulate", "retrieve", "evaluate"],
+    "total_time_seconds": 89.37
+  },
+  "config": {
+    "method_params": {"mode": "zs"},
+    "llm_config": {"temperature": 1.0, "max_tokens": 128, "top_p": 1.0},
+    "searcher": {"name": "UserPyseriniWrapper", "type": "user_pyserini"},
+    "dataset_config": {
+      "topics": "dl19-passage",
+      "index": "msmarco-v1-passage",
+      "num_queries": 43,
+      "bm25_weights": {"k1": 0.9, "b": 0.4}
+    }
+  },
+  "metrics": {
+    "map": 0.3709,
+    "ndcg_cut_10": 0.5679,
+    "recall_1000": 0.8384
+  },
+  "timing": {
+    "reformulation_seconds": 65.24,
+    "retrieval_seconds": 3.01,
+    "evaluation_seconds": 10.53
+  },
+  "artifacts": {
+    "run_file": "ddb15ccf.run.txt",
+    "reformulated_queries": "ddb15ccf.queries.tsv"
+  }
+}
+```
+
+## On-disk layout
+
+A run lives under:
+
+```
+reproducibility/data/runs/{dataset_id}/{method_id}/{model}/{params_hash}.{json,run.txt,queries.tsv}
+```
+
+The three sibling files together describe one run completely. The `.run.txt` is a TREC-format retrieval run that allows independent re-evaluation with `pytrec_eval`; the `.queries.tsv` lets reviewers spot-check reformulations.
+
+## Bumping the schema
+
+Future schema changes require:
+1. Bumping `SCHEMA_VERSION` in `reproducibility/lib/emit.py` to 2.
+2. Updating `schema.json`'s `schema_version.const` to 2.
+3. Re-emitting all existing JSONs under v2 (one bulk PR).
+4. Updating the dashboard product to consume v2.
+
+The schema is intentionally hard to change so that the leaderboard's history stays comparable.
diff --git a/reproducibility/scripts/__init__.py b/reproducibility/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/reproducibility/scripts/aggregate_runs.py b/reproducibility/scripts/aggregate_runs.py
new file mode 100644
index 0000000..0c1016a
--- /dev/null
+++ b/reproducibility/scripts/aggregate_runs.py
@@ -0,0 +1,268 @@
+"""Walk reproducibility/data/runs/, validate each JSON, emit results.csv + manifest.json.
+
+Deterministic by design: sorted rows, fixed column order, LF line endings, sorted JSON.
+The committed CSV must equal the output of this script for any given runs/ tree, which
+the CI workflow enforces via --check.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import hashlib
+import io
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Iterator
+
+# Make `reproducibility.lib` importable when invoked as a script from the repo root.
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+from reproducibility.lib import SCHEMA_VERSION, validate, ValidationError  # noqa: E402
+
+DATA_DIR = _REPO_ROOT / "reproducibility" / "data"
+RUNS_DIR = DATA_DIR / "runs"
+RESULTS_CSV = DATA_DIR / "results.csv"
+MANIFEST_JSON = DATA_DIR / "manifest.json"
+
+CSV_COLUMNS = [  # fixed column order of results.csv; the header row is written from this list
+    "schema_version",
+    "run_id",
+    "dataset_id",
+    "method_id",
+    "model",
+    "params_hash",
+    "method_params_json",  # config.method_params as compact JSON with sorted keys
+    "llm_temperature",
+    "llm_max_tokens",
+    "metric",  # a run contributes one row per metric name ...
+    "value",  # ... and this is that metric's value
+    "num_queries",
+    "total_time_seconds",
+    "querygym_version",
+    "run_file_path",  # path of the run JSON, relative to the repo root
+]
+
+
+def _iter_run_files(runs_dir: Path) -> Iterator[Path]:
+    yield from sorted(runs_dir.glob("**/*.json"))  # recursive; sorted so the walk order is stable
+
+
+def _load_and_validate(path: Path, dataset_registry, method_registry) -> dict:
+    """Parse the run JSON at ``path`` and validate it; exit naming the file on any failure."""
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except (UnicodeDecodeError, json.JSONDecodeError) as e:
+        # A bare decode traceback would not say which run file is broken.
+        raise SystemExit(f"invalid JSON in {path}: {e}") from e
+    try:
+        validate(payload, dataset_registry=dataset_registry, method_registry=method_registry)
+    except ValidationError as e:
+        raise SystemExit(f"validation failed for {path}: {e}") from e
+    return payload
+
+
+def _payload_to_rows(payload: dict, run_path: Path) -> list[list]:
+    """Flatten one run payload into CSV rows: one per metric, cells in CSV_COLUMNS order.
+
+    ``run_path`` is reported repo-relative when inside the repo, else as an absolute path.
+    """
+    pipeline = payload["pipeline"]
+    config = payload["config"]
+    located = run_path if run_path.is_absolute() else run_path.resolve()
+    try:
+        rel_path = located.relative_to(_REPO_ROOT).as_posix()
+    except ValueError:  # outside the repo: relative_to() cannot express the path
+        rel_path = located.as_posix()
+    shared = {
+        "schema_version": payload["schema_version"],
+        "run_id": payload["run_id"],
+        "dataset_id": pipeline["dataset_id"],
+        "method_id": pipeline["method_id"],
+        "model": pipeline["model"],
+        "params_hash": payload["params_hash"],
+        "method_params_json": json.dumps(config["method_params"], sort_keys=True, separators=(",", ":")),
+        "llm_temperature": config["llm_config"]["temperature"],
+        "llm_max_tokens": config["llm_config"]["max_tokens"],
+        "num_queries": config["dataset_config"]["num_queries"],
+        "total_time_seconds": pipeline["total_time_seconds"],
+        "querygym_version": payload["querygym_version"],
+        "run_file_path": rel_path,
+    }
+    rows = []
+    for metric in sorted(payload["metrics"]):
+        cells = {**shared, "metric": metric, "value": payload["metrics"][metric]}
+        rows.append([cells[column] for column in CSV_COLUMNS])
+    return rows
+
+
+def _write_csv(rows: list[list]) -> str:
+    """Serialize the header plus ``rows`` into CSV text, always using LF row endings."""
+    with io.StringIO(newline="") as sink:
+        # newline="" keeps the buffer from translating the "\n" the writer emits.
+        out = csv.writer(sink, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
+        out.writerows([CSV_COLUMNS, *rows])
+        text = sink.getvalue()
+    return text
+
+
+def _content_hash(text: str) -> str:
+    return hashlib.sha256(bytes(text, "utf-8")).hexdigest()  # SHA-256 hex digest of the UTF-8 bytes
+
+
+def _querygym_version() -> str:
+    try:  # best effort: querygym may not be importable in this environment
+        import querygym as installed  # type: ignore
+        return getattr(installed, "__version__", "unknown")
+    except ImportError:
+        return "unknown"
+
+
+def aggregate(runs_dir: Path) -> tuple[str, dict]:
+    """Validate every run JSON under ``runs_dir`` and build the aggregate outputs.
+
+    Returns ``(csv_text, manifest)``. The CSV rows come from the run files; the manifest
+    additionally records the querygym version importable in the current environment.
+    """
+    # Imported lazily; both registries are loaded once and shared by every run.
+    from reproducibility.lib.validate import (
+        _load_dataset_registry,
+        _load_method_registry,
+    )
+
+    datasets = _load_dataset_registry(None)
+    methods = list(_load_method_registry())
+
+    run_paths = list(_iter_run_files(runs_dir))
+    table: list[list] = []
+    for run_path in run_paths:
+        table += _payload_to_rows(_load_and_validate(run_path, datasets, methods), run_path)
+
+    # Stable diffs: order rows by (dataset_id, method_id, model, params_hash, metric).
+    key_columns = [
+        CSV_COLUMNS.index(name)
+        for name in ("dataset_id", "method_id", "model", "params_hash", "metric")
+    ]
+
+    def row_key(row: list) -> tuple:
+        return tuple(row[i] for i in key_columns)
+
+    table.sort(key=row_key)
+
+    csv_text = _write_csv(table)
+    manifest = {
+        "schema_version": SCHEMA_VERSION,
+        "querygym_version": _querygym_version(),
+        "run_count": len(run_paths),
+        "row_count": len(table),
+        "content_hash": _content_hash(csv_text),
+    }
+    return csv_text, manifest
+
+
+def _read_committed_files() -> tuple[str | None, dict | None]:
+    """Load the committed results.csv text and manifest.json dict (``None`` when absent).
+
+    The CSV is decoded from raw bytes so newline translation cannot hide CRLF line endings.
+    """
+    csv_text = RESULTS_CSV.read_bytes().decode("utf-8") if RESULTS_CSV.exists() else None
+    manifest = json.loads(MANIFEST_JSON.read_text(encoding="utf-8")) if MANIFEST_JSON.exists() else None
+    return csv_text, manifest
+
+
+def cmd_write(runs_dir: Path) -> int:
+    """Regenerate results.csv and manifest.json from ``runs_dir``; returns exit code 0."""
+    csv_text, manifest = aggregate(runs_dir)
+    manifest["generated_at"] = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    manifest_text = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
+
+    DATA_DIR.mkdir(parents=True, exist_ok=True)
+    # Write bytes, not text: text mode turns "\n" into CRLF on Windows, so output would vary.
+    RESULTS_CSV.write_bytes(csv_text.encode("utf-8"))
+    MANIFEST_JSON.write_bytes(manifest_text.encode("utf-8"))
+    print(
+        f"wrote {RESULTS_CSV.relative_to(_REPO_ROOT)} "
+        f"({manifest['run_count']} runs, {manifest['row_count']} rows, "
+        f"content_hash={manifest['content_hash'][:12]}...)"
+    )
+    return 0
+
+
+def cmd_check(runs_dir: Path) -> int:
+    """Compare the committed outputs with a fresh aggregation of ``runs_dir``.
+
+    Prints every discrepancy to stderr and returns 1, or prints a summary and returns 0.
+    """
+    fresh_csv, fresh_manifest = aggregate(runs_dir)
+    committed_csv, committed_manifest = _read_committed_files()
+    csv_name = RESULTS_CSV.relative_to(_REPO_ROOT)
+    manifest_name = MANIFEST_JSON.relative_to(_REPO_ROOT)
+
+    problems = []
+
+    if committed_csv is None:
+        problems.append(f"{csv_name} is missing")
+    elif committed_csv != fresh_csv:
+        problems.append(f"{csv_name} is out of date (committed != regenerated)")
+
+    if committed_manifest is None:
+        problems.append(f"{manifest_name} is missing")
+    else:
+        # generated_at is deliberately left out: it is intentionally volatile.
+        compared = ("schema_version", "querygym_version", "run_count", "row_count", "content_hash")
+        for field in compared:
+            old, new = committed_manifest.get(field), fresh_manifest.get(field)
+            if old != new:
+                problems.append(
+                    f"{manifest_name}: {field} mismatch (committed={old!r}, regenerated={new!r})"
+                )
+
+    if not problems:
+        print(
+            f"OK: {fresh_manifest['run_count']} runs, {fresh_manifest['row_count']} rows, "
+            f"content_hash={fresh_manifest['content_hash'][:12]}..."
+        )
+        return 0
+
+    print("Aggregator --check failed:", file=sys.stderr)
+    for problem in problems:
+        print(f"  - {problem}", file=sys.stderr)
+    print(
+        "\nFix by running:\n  python -m reproducibility.scripts.aggregate_runs\nand committing the diff.",
+        file=sys.stderr,
+    )
+    return 1
+
+
+def main() -> int:
+    """CLI entry point: parse arguments, then run the check or the write command."""
+    parser = argparse.ArgumentParser(
+        description="Aggregate run JSONs into results.csv + manifest.json."
+    )
+    parser.add_argument(
+        "--check",
+        action="store_true",
+        help="Verify committed files match what the aggregator would produce. "
+        "Exits non-zero on mismatch. Used by CI.",
+    )
+    default_label = RUNS_DIR.relative_to(_REPO_ROOT)
+    parser.add_argument(
+        "--runs-dir",
+        type=Path,
+        default=RUNS_DIR,
+        help=f"Directory to walk for run JSONs (default: {default_label}).",
+    )
+    args = parser.parse_args()
+
+    if not args.runs_dir.exists():  # a missing runs directory is created before the walk
+        args.runs_dir.mkdir(parents=True, exist_ok=True)
+    handler = cmd_check if args.check else cmd_write
+    return handler(args.runs_dir)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/reproducibility/scripts/submit_run.py b/reproducibility/scripts/submit_run.py
new file mode 100644
index 0000000..4ed6612
--- /dev/null
+++ b/reproducibility/scripts/submit_run.py
@@ -0,0 +1,154 @@
+"""Validate a fresh run JSON and copy it (plus sibling artifacts) into the canonical layout.
+
+Used by:
+- Internal trusted contributors after running the example pipeline.
+- External fork contributors before opening a PR.
+- The one-time SIGIR backfill (re-emitting legacy JSONs under v1).
+
+The example pipeline writes pipeline_summary.json (v1 schema) plus run.txt and
+reformulated_queries.tsv to its --output-dir. This script picks those up and lays them
+into reproducibility/data/runs/{dataset_id}/{method_id}/{model}/{params_hash}.{ext}.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import sys
+from pathlib import Path
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+from reproducibility.lib import validate, ValidationError  # noqa: E402
+
+DEFAULT_RUNS_DIR = _REPO_ROOT / "reproducibility" / "data" / "runs"
+
+# Common filenames the example pipeline produces in --output-dir.
+SUMMARY_CANDIDATES = ("pipeline_summary.json",)  # each tuple: paths relative to --from-dir
+RUN_FILE_CANDIDATES = ("runs/run.txt", "run.txt")  # _find() returns the first one that exists
+QUERIES_CANDIDATES = (
+    "queries/reformulated_queries.tsv",
+    "reformulated_queries.tsv",
+)
+
+
+def _find(from_dir: Path, candidates: tuple[str, ...]) -> Path | None:
+    """Return the first ``from_dir / candidate`` that exists, or ``None`` if none do."""
+    existing = (
+        from_dir / name for name in candidates if (from_dir / name).exists()
+    )
+    return next(existing, None)
+
+
+def _resolve_inputs(from_dir: Path) -> tuple[Path, Path, Path]:
+    """Return ``(summary, run_file, queries)`` found under ``from_dir``; exit if any is missing."""
+    # (candidate paths, message to exit with when none of them exists), checked in order.
+    lookups = (
+        (SUMMARY_CANDIDATES,
+         f"could not find pipeline_summary.json under {from_dir}. Did the pipeline complete?"),
+        (RUN_FILE_CANDIDATES,
+         f"could not find run.txt under {from_dir} (looked in: {RUN_FILE_CANDIDATES})"),
+        (QUERIES_CANDIDATES,
+         f"could not find reformulated_queries.tsv under {from_dir} (looked in: {QUERIES_CANDIDATES})"),
+    )
+    found = []
+    for candidates, complaint in lookups:
+        hit = _find(from_dir, candidates)
+        if hit is None:
+            raise SystemExit(complaint)
+        found.append(hit)
+    summary, run_file, queries = found
+    return summary, run_file, queries
+
+
+def _canonical_dir(runs_dir: Path, payload: dict) -> Path:
+    ids = payload["pipeline"]  # layout: {runs_dir}/{dataset_id}/{method_id}/{model}
+    return runs_dir.joinpath(ids["dataset_id"], ids["method_id"], ids["model"])
+
+
+def _copy(src: Path, dst: Path, *, force: bool) -> None:
+    """Copy ``src`` to ``dst``, creating parent dirs; exit if ``dst`` exists and not ``force``."""
+    if dst.exists() and not force:
+        shown = dst.relative_to(_REPO_ROOT) if _REPO_ROOT in dst.parents else dst  # else relative_to() raises
+        raise SystemExit(f"refusing to overwrite {shown} (use --force).")
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    shutil.copy2(src, dst)
+
+
+def main() -> int:
+    """CLI entry point: validate a pipeline output dir and install it under the runs dir.
+
+    All destinations are checked before any write, so a refused overwrite leaves nothing behind.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Validate a run output directory and copy its files into the canonical "
+            "reproducibility/data/runs/ layout."
+        )
+    )
+    parser.add_argument(
+        "--from-dir",
+        type=Path,
+        required=True,
+        help="Directory produced by examples/querygym_pyserini/pipeline.py.",
+    )
+    parser.add_argument(
+        "--runs-dir",
+        type=Path,
+        default=DEFAULT_RUNS_DIR,
+        help=f"Target runs directory (default: {DEFAULT_RUNS_DIR.relative_to(_REPO_ROOT)}).",
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Overwrite an existing run with the same params_hash.",
+    )
+    parser.add_argument(
+        "--skip-registry-checks",
+        action="store_true",
+        help="Skip dataset/method registry validation (use only for synthetic test runs).",
+    )
+    args = parser.parse_args()
+
+    if not args.from_dir.is_dir():
+        raise SystemExit(f"--from-dir does not exist: {args.from_dir}")
+
+    summary, run_file, queries = _resolve_inputs(args.from_dir)
+    payload = json.loads(summary.read_text(encoding="utf-8"))
+    try:
+        validate(payload, skip_registry_checks=args.skip_registry_checks)
+    except ValidationError as e:
+        raise SystemExit(f"validation failed for {summary}: {e}") from e
+
+    def shown(path: Path) -> Path:
+        # relative_to() raises ValueError when --runs-dir is relative or outside the repo.
+        return path.relative_to(_REPO_ROOT) if _REPO_ROOT in path.parents else path
+
+    target_dir = _canonical_dir(args.runs_dir, payload)
+    h = payload["params_hash"]
+    json_dst = target_dir / f"{h}.json"
+    run_dst = target_dir / f"{h}.run.txt"
+    queries_dst = target_dir / f"{h}.queries.tsv"
+
+    # Refuse up front if any destination exists, before the first write happens.
+    for dst in (json_dst, run_dst, queries_dst):
+        if dst.exists() and not args.force:
+            raise SystemExit(f"refusing to overwrite {shown(dst)} (use --force).")
+
+    target_dir.mkdir(parents=True, exist_ok=True)
+    # Re-serialize the validated payload rather than copying the summary file verbatim.
+    with json_dst.open("w", encoding="utf-8") as f:
+        json.dump(payload, f, indent=2, sort_keys=False)
+        f.write("\n")
+    _copy(run_file, run_dst, force=args.force)
+    _copy(queries, queries_dst, force=args.force)
+    print(f"wrote {shown(json_dst).as_posix()}")
+    print("Now run:\n  make repro-aggregate\nand commit the diff.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/reproducibility/tests/__init__.py b/reproducibility/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/reproducibility/tests/fixtures/sample_run.json b/reproducibility/tests/fixtures/sample_run.json
new file mode 100644
index 0000000..debc31b
--- /dev/null
+++ b/reproducibility/tests/fixtures/sample_run.json
@@ -0,0 +1,60 @@
+{
+  "schema_version": 1,
+  "run_id": "cabe83ca1236a3bb",
+  "params_hash": "ddb15ccf",
+  "submitted_at": "2026-04-29T10:14:22Z",
+  "querygym_version": "0.3.0",
+  "environment": {
+    "python_version": "3.10.13",
+    "platform": "Linux-5.15.0-x86_64",
+    "git_commit": "5c46a51"
+  },
+  "pipeline": {
+    "dataset_id": "msmarco-v1-passage.trecdl2019",
+    "method_id": "query2e",
+    "model": "gpt-4.1-mini",
+    "steps_completed": [
+      "reformulate",
+      "retrieve",
+      "evaluate"
+    ],
+    "total_time_seconds": 89.37
+  },
+  "config": {
+    "method_params": {
+      "mode": "zs"
+    },
+    "llm_config": {
+      "temperature": 1.0,
+      "max_tokens": 128,
+      "top_p": 1.0
+    },
+    "searcher": {
+      "name": "UserPyseriniWrapper",
+      "type": "user_pyserini"
+    },
+    "dataset_config": {
+      "topics": "dl19-passage",
+      "index": "msmarco-v1-passage",
+      "num_queries": 43,
+      "bm25_weights": {
+        "k1": 0.9,
+        "b": 0.4
+      }
+    }
+  },
+  "metrics": {
+    "map": 0.3709,
+    "ndcg_cut_10": 0.5679,
+    "recall_1000": 0.8384
+  },
+  "timing": {
+    "reformulation_seconds": 65.24,
+    "retrieval_seconds": 3.01,
+    "evaluation_seconds": 10.53
+  },
+  "artifacts": {
+    "run_file": "ddb15ccf.run.txt",
+    "reformulated_queries": "ddb15ccf.queries.tsv"
+  }
+}
diff --git a/reproducibility/tests/test_repro_schema.py b/reproducibility/tests/test_repro_schema.py
new file mode 100644
index 0000000..c7d2c99
--- /dev/null
+++ b/reproducibility/tests/test_repro_schema.py
@@ -0,0 +1,216 @@
+"""Tests for reproducibility.lib (emit + validate)."""
+
+from __future__ import annotations
+
+import copy
+import json
+from pathlib import Path
+
+import pytest
+
+from reproducibility.lib import (
+    SCHEMA_VERSION,
+    ValidationError,
+    build_run_summary,
+    compute_params_hash,
+    compute_run_id,
+    validate,
+)
+
+FIXTURE = Path(__file__).parent / "fixtures" / "sample_run.json"
+
+
+def _load_fixture() -> dict:
+    # Parsed fresh on every call, so each test may mutate its own copy.
+    return json.loads(FIXTURE.read_text(encoding="utf-8"))
+
+
+def _build_kwargs() -> dict:
+    """Keyword arguments for build_run_summary that reproduce the fixture run."""
+    return {
+        "dataset_id": "msmarco-v1-passage.trecdl2019",
+        "method_id": "query2e",
+        "model": "gpt-4.1-mini",
+        "method_params": {"mode": "zs"},
+        "llm_config": {"temperature": 1.0, "max_tokens": 128, "top_p": 1.0},
+        "searcher": {"name": "UserPyseriniWrapper", "type": "user_pyserini"},
+        "dataset_config": {
+            "topics": "dl19-passage",
+            "index": "msmarco-v1-passage",
+            "num_queries": 43,
+            "bm25_weights": {"k1": 0.9, "b": 0.4},
+        },
+        "metrics": {"map": 0.3709, "ndcg_cut_10": 0.5679, "recall_1000": 0.8384},
+        "timing": {
+            "reformulation_seconds": 65.24,
+            "retrieval_seconds": 3.01,
+            "evaluation_seconds": 10.53,
+        },
+        "steps_completed": ["reformulate", "retrieve", "evaluate"],
+        "total_time_seconds": 89.37,
+        # Volatile fields are pinned so the built summary is reproducible.
+        "submitted_at": "2026-04-29T10:14:22Z",
+        "environment": {
+            "python_version": "3.10.13",
+            "platform": "Linux-5.15.0-x86_64",
+            "git_commit": "5c46a51",
+        },
+        "querygym_version": "0.3.0",
+    }
+
+
+# ---------- Hash properties --------------------------------------------------
+
+
+def test_params_hash_is_8_hex():
+    digest = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"t": 1.0})
+    assert len(digest) == 8
+    assert set(digest) <= set("0123456789abcdef")
+
+
+def test_params_hash_is_stable():
+    first = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"temperature": 1.0})
+    again = compute_params_hash("query2e", "gpt-4.1-mini", {"mode": "zs"}, {"temperature": 1.0})
+    assert first == again  # equal inputs, equal digest
+
+
+def test_params_hash_changes_on_temperature():
+    fixed = ("query2e", "gpt-4.1-mini", {"mode": "zs"})
+    hot, cool = ({"temperature": t} for t in (1.0, 0.5))
+    assert compute_params_hash(*fixed, hot) != compute_params_hash(*fixed, cool)
+
+
+def test_params_hash_invariant_to_key_order():
+    forward = compute_params_hash("query2e", "gpt-4.1-mini", {"a": 1, "b": 2}, {"x": 1, "y": 2})
+    backward = compute_params_hash("query2e", "gpt-4.1-mini", {"b": 2, "a": 1}, {"y": 2, "x": 1})
+    assert forward == backward  # same mappings, keys listed in reverse order
+
+
+def test_run_id_excludes_volatile_fields():
+    original = _load_fixture()
+    altered = {  # identical payload except for the volatile fields
+        **copy.deepcopy(original),
+        "submitted_at": "2099-12-31T23:59:59Z",
+        "environment": {"python_version": "9.9", "platform": "any", "git_commit": None},
+    }
+    assert compute_run_id(original) == compute_run_id(altered)
+
+
+def test_run_id_changes_on_metric_change():
+    baseline = _load_fixture()
+    edited = copy.deepcopy(baseline)
+    edited["metrics"]["map"] = 0.9999
+    # A different metric value must produce a different run_id.
+    before, after = compute_run_id(baseline), compute_run_id(edited)
+    assert before != after
+
+
+# ---------- build_run_summary ------------------------------------------------
+
+
+def test_build_run_summary_matches_fixture():
+    """build_run_summary fed the canonical inputs must reproduce the fixture exactly."""
+    expected = _load_fixture()
+    produced = build_run_summary(**_build_kwargs())
+    assert produced == expected
+
+
+def test_build_run_summary_validates_clean():
+    summary = build_run_summary(**_build_kwargs())
+    validate(summary)  # passes by not raising
+
+
+def test_build_run_summary_artifact_filenames_use_params_hash():
+    summary = build_run_summary(**_build_kwargs())
+    digest, artifacts = summary["params_hash"], summary["artifacts"]
+    assert artifacts["run_file"] == f"{digest}.run.txt"
+    assert artifacts["reformulated_queries"] == f"{digest}.queries.tsv"
+
+
+# ---------- Validator: schema-level rejections -------------------------------
+
+
+def test_validator_rejects_missing_schema_version():
+    broken = _load_fixture()
+    broken.pop("schema_version")
+    with pytest.raises(ValidationError, match="schema_version"):
+        validate(broken)
+
+
+def test_validator_rejects_wrong_schema_version():
+    # The fixture carries schema_version 1; 2 must be rejected.
+    future = {**_load_fixture(), "schema_version": 2}
+    with pytest.raises(ValidationError):
+        validate(future)
+
+
+def test_validator_rejects_extra_top_level_field():
+    padded = _load_fixture()
+    padded.update(whoops="extra")
+    with pytest.raises(ValidationError):
+        validate(padded)
+
+
+def test_validator_rejects_malformed_artifact_filename():
+    broken = _load_fixture()
+    broken["artifacts"].update(run_file="not-a-hash.run.txt")
+    with pytest.raises(ValidationError, match="artifacts"):
+        validate(broken)
+
+
+# ---------- Validator: registry-level rejections -----------------------------
+
+
+def test_validator_rejects_unknown_dataset():
+    forged = _load_fixture()
+    forged["pipeline"].update(dataset_id="fake-dataset")
+    # Refresh run_id so the registry check fails, not the hash check.
+    forged.update(run_id=compute_run_id(forged))
+    with pytest.raises(ValidationError, match="dataset_id 'fake-dataset'"):
+        validate(forged)
+
+
+def test_validator_rejects_metric_outside_eval_metrics():
+    forged = _load_fixture()
+    forged["metrics"].update(bleu=0.5)
+    forged.update(run_id=compute_run_id(forged))
+    with pytest.raises(ValidationError, match="not in eval_metrics"):
+        validate(forged)
+
+
+# ---------- Validator: hash-level rejections ---------------------------------
+
+
+def test_validator_rejects_tampered_params_hash():
+    forged = {**_load_fixture(), "params_hash": "deadbeef"}
+    # run_id is refreshed after the edit; the expected failure is the params_hash check.
+    forged["run_id"] = compute_run_id(forged)
+    with pytest.raises(ValidationError, match="params_hash mismatch"):
+        validate(forged)
+
+
+def test_validator_rejects_tampered_run_id():
+    # Sixteen zeros: same length as the fixture's run_id, different value.
+    forged = {**_load_fixture(), "run_id": f"{0:016d}"}
+    with pytest.raises(ValidationError, match="run_id mismatch"):
+        validate(forged)
+
+
+def test_validator_rejects_silent_metric_edit():
+    """A metric edited by hand, with run_id left stale, must fail validation."""
+    edited = _load_fixture()
+    edited["metrics"].update(map=0.9999)  # run_id deliberately not recomputed
+    with pytest.raises(ValidationError, match="run_id mismatch"):
+        validate(edited)
+
+
+# ---------- Skip-registry-checks escape hatch (for tests with synthetic ids) ---
+
+
+def test_validator_skip_registry_checks_allows_unknown_ids():
+    """With registry checks disabled, a summary for an unknown dataset still validates."""
+    kwargs = {**_build_kwargs(), "dataset_id": "synthetic-test-dataset"}
+    summary = build_run_summary(**kwargs)
+    # No exception is expected here: the schema and hash checks pass,
+    # and the registry check is skipped.
+    validate(summary, skip_registry_checks=True)